Total coverage: 405885 (24%)of 1756392
41 41 41 41 41 40 41 41 7 7 7 7 7 1 1 1 1 1 1 1 58 58 58 50 42 50 42 41 15 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 // SPDX-License-Identifier: GPL-2.0 /* * FUSE inode io modes. * * Copyright (c) 2024 CTERA Networks. */ #include "fuse_i.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/fs.h> /* * Return true if need to wait for new opens in caching mode. */ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) { return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); } /* * Called on cached file open() and on first mmap() of direct_io file. * Takes cached_io inode mode reference to be dropped on file release. * * Blocks new parallel dio writes and waits for the in-progress parallel dio * writes to complete. */ int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); /* There are no io modes if server does not implement open */ if (!ff->args) return 0; spin_lock(&fi->lock); /* * Setting the bit advises new direct-io writes to use an exclusive * lock - without it the wait below might be forever. */ while (fuse_is_io_cache_wait(fi)) { set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); spin_lock(&fi->lock); } /* * Check if inode entered passthrough io mode while waiting for parallel * dio write completion. */ if (fuse_inode_backing(fi)) { clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); return -ETXTBSY; } WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { ff->iomode = IOM_CACHED; if (fi->iocachectr == 0) set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); fi->iocachectr++; } spin_unlock(&fi->lock); return 0; } static void fuse_file_cached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { spin_lock(&fi->lock); WARN_ON(fi->iocachectr <= 0); WARN_ON(ff->iomode != IOM_CACHED); ff->iomode = IOM_NONE; fi->iocachectr--; if (fi->iocachectr == 0) clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); spin_unlock(&fi->lock); } /* Start strictly uncached io mode where cache access is not allowed */ int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) { struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); /* deny conflicting backing files on same fuse inode */ oldfb = fuse_inode_backing(fi); if (fb && oldfb && oldfb != fb) { err = -EBUSY; goto unlock; } if (fi->iocachectr > 0) { err = -ETXTBSY; goto unlock; } fi->iocachectr--; /* fuse inode holds a single refcount of backing file */ if (fb && !oldfb) { oldfb = fuse_inode_backing_set(fi, fb); WARN_ON_ONCE(oldfb != NULL); } else { fuse_backing_put(fb); } unlock: spin_unlock(&fi->lock); return err; } /* Takes uncached_io inode mode reference to be dropped on file release */ static int fuse_file_uncached_io_open(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); int err; err = fuse_inode_uncached_io_start(fi, fb); if (err) return err; WARN_ON(ff->iomode != IOM_NONE); ff->iomode = IOM_UNCACHED; return 0; } void fuse_inode_uncached_io_end(struct fuse_inode *fi) { struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); fi->iocachectr++; if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); oldfb = fuse_inode_backing_set(fi, NULL); } spin_unlock(&fi->lock); if (oldfb) fuse_backing_put(oldfb); } /* Drop uncached_io reference from passthrough open */ static void fuse_file_uncached_io_release(struct fuse_file *ff, struct fuse_inode *fi) { WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fuse_inode_uncached_io_end(fi); } /* * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write * operations go directly to the server, but mmap is done on the backing file. * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. */ #define FOPEN_PASSTHROUGH_MASK \ (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ FOPEN_NOFLUSH) static int fuse_file_passthrough_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_backing *fb; int err; /* Check allowed conditions for file open in passthrough mode */ if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; fb = fuse_passthrough_open(file, inode, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ err = fuse_file_uncached_io_open(inode, ff, fb); if (!err) return 0; fuse_passthrough_release(ff, fb); fuse_backing_put(fb); return err; } /* Request access to submit new io to inode via open file */ int fuse_file_io_open(struct file *file, struct inode *inode) { struct fuse_file *ff = file->private_data; struct fuse_inode *fi = get_fuse_inode(inode); int err; /* * io modes are not relevant with DAX and with server that does not * implement open. */ if (FUSE_IS_DAX(inode) || !ff->args) return 0; /* * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode * which is already open for passthrough. */ err = -EINVAL; if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) goto fail; /* * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. */ if (!(ff->open_flags & FOPEN_DIRECT_IO)) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* * First passthrough file open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. */ if ((ff->open_flags & FOPEN_DIRECT_IO) && !(ff->open_flags & FOPEN_PASSTHROUGH)) return 0; if (ff->open_flags & FOPEN_PASSTHROUGH) err = fuse_file_passthrough_open(inode, file); else err = fuse_file_cached_io_open(inode, ff); if (err) goto fail; return 0; fail: pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", ff->open_flags, err); /* * The file open mode determines the inode io mode. * Using incorrect open mode is a server mistake, which results in * user visible failure of open() with EIO error. */ return -EIO; } /* No more pending io and no new io possible to inode via open/mmapped file */ void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); /* * Last passthrough file close allows caching inode io mode. * Last caching file close exits caching inode io mode. */ switch (ff->iomode) { case IOM_NONE: /* Nothing to do */ break; case IOM_UNCACHED: fuse_file_uncached_io_release(ff, fi); break; case IOM_CACHED: fuse_file_cached_io_release(ff, fi); break; } }
125 1 1 1 180 1 184 4 38 12 26 151 52 107 68 69 45 45 45 110 110 128 84 84 45 156 155 155 183 156 152 156 38 156 124 124 4 4 131 131 37 37 34 34 3 106 96 96 51 45 2 43 83 6 3 3 3 2 2 1 3 3 131 82 82 128 128 128 45 10 35 111 3 13 83 1 83 4 3 3 1 1 1 125 125 124 116 116 38 116 37 2 35 116 35 35 35 12 35 232 38 38 12 12 35 35 35 232 232 8 223 86 105 179 110 69 239 240 8 241 234 26 26 241 26 17 224 79 74 3 72 86 176 5 4 172 172 145 154 89 139 139 139 138 138 166 166 162 154 154 114 119 5 35 153 8 154 89 152 150 153 83 100 17 17 4 3 2 1 2 16 17 6 6 1 1 1 6 4 4 18 6 12 19 3 19 3 18 18 18 19 19 78 1 78 4 4 4 4 2 4 2 2 4 78 77 76 77 78 87 11 87 4 87 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/hashtable.h> #include <linux/io_uring.h> #include <trace/events/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" #include "kbuf.h" #include "poll.h" #include "cancel.h" struct io_poll_update { struct file *file; u64 old_user_data; u64 new_user_data; __poll_t events; bool update_events; bool update_user_data; }; struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; int nr_entries; int error; bool owning; /* output value, set only if arm poll returns >0 */ __poll_t result_mask; }; #define IO_POLL_CANCEL_FLAG BIT(31) #define IO_POLL_RETRY_FLAG BIT(30) #define IO_POLL_REF_MASK GENMASK(29, 0) /* * We usually have 1-2 refs taken, 128 is more than enough and we want to * maximise the margin between this amount and the moment when it overflows. */ #define IO_POLL_REF_BIAS 128 #define IO_WQE_F_DOUBLE 1 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key); static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE); } static inline bool wqe_is_double(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; return priv & IO_WQE_F_DOUBLE; } static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) { int v; /* * poll_refs are already elevated and we don't have much hope for * grabbing the ownership. Instead of incrementing set a retry flag * to notify the loop that there might have been some change. */ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs); if (v & IO_POLL_REF_MASK) return false; return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. It's disallowed to modify requests while not * owning it, that prevents from races for enqueueing task_work's and b/w * arming poll and wakeups. */ static inline bool io_poll_get_ownership(struct io_kiocb *req) { if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) return io_poll_get_ownership_slowpath(req); return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); } static void io_poll_mark_cancelled(struct io_kiocb *req) { atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } static struct io_poll *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) return req->async_data; return req->apoll->double_poll; } static struct io_poll *io_poll_get_single(struct io_kiocb *req) { if (req->opcode == IORING_OP_POLL_ADD) return io_kiocb_to_cmd(req, struct io_poll); return &req->apoll->poll; } static void io_poll_req_insert(struct io_kiocb *req) { struct io_hash_table *table = &req->ctx->cancel_table; u32 index = hash_long(req->cqe.user_data, table->hash_bits); struct io_hash_bucket *hb = &table->hbs[index]; spin_lock(&hb->lock); hlist_add_head(&req->hash_node, &hb->list); spin_unlock(&hb->lock); } static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) { struct io_hash_table *table = &req->ctx->cancel_table; u32 index = hash_long(req->cqe.user_data, table->hash_bits); spinlock_t *lock = &table->hbs[index].lock; spin_lock(lock); hash_del(&req->hash_node); spin_unlock(lock); } static void io_poll_req_insert_locked(struct io_kiocb *req) { struct io_hash_table *table = &req->ctx->cancel_table_locked; u32 index = hash_long(req->cqe.user_data, table->hash_bits); lockdep_assert_held(&req->ctx->uring_lock); hlist_add_head(&req->hash_node, &table->hbs[index].list); } static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; if (req->flags & REQ_F_HASH_LOCKED) { /* * ->cancel_table_locked is protected by ->uring_lock in * contrast to per bucket spinlocks. Likely, tctx_task_work() * already grabbed the mutex for us, but there is a chance it * failed. */ io_tw_lock(ctx, ts); hash_del(&req->hash_node); req->flags &= ~REQ_F_HASH_LOCKED; } else { io_poll_req_delete(req, ctx); } } static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) { poll->head = NULL; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); } static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); if (head) { spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); poll->head = NULL; spin_unlock_irq(&head->lock); } } static void io_poll_remove_entries(struct io_kiocb *req) { /* * Nothing to do if neither of those flags are set. Avoid dipping * into the poll/apoll/double cachelines if we can. */ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) return; /* * While we hold the waitqueue lock and the waitqueue is nonempty, * wake_up_pollfree() will wait for us. However, taking the waitqueue * lock in the first place can race with the waitqueue being freed. * * We solve this as eventpoll does: by taking advantage of the fact that * all users of wake_up_pollfree() will RCU-delay the actual free. If * we enter rcu_read_lock() and see that the pointer to the queue is * non-NULL, we can then lock it without the memory being freed out from * under us. * * Keep holding rcu_read_lock() as long as we hold the queue lock, in * case the caller deletes the entry from the queue, leaving it empty. * In that case, only RCU prevents the queue memory from being freed. */ rcu_read_lock(); if (req->flags & REQ_F_SINGLE_POLL) io_poll_remove_entry(io_poll_get_single(req)); if (req->flags & REQ_F_DOUBLE_POLL) io_poll_remove_entry(io_poll_get_double(req)); rcu_read_unlock(); } enum { IOU_POLL_DONE = 0, IOU_POLL_NO_ACTION = 1, IOU_POLL_REMOVE_POLL_USE_RES = 2, IOU_POLL_REISSUE = 3, IOU_POLL_REQUEUE = 4, }; static void __io_poll_execute(struct io_kiocb *req, int mask) { unsigned flags = 0; io_req_set_res(req, mask, 0); req->io_task_work.func = io_poll_task_func; trace_io_uring_task_add(req, mask); if (!(req->flags & REQ_F_POLL_NO_LAZY)) flags = IOU_F_TWQ_LAZY_WAKE; __io_req_task_work_add(req, flags); } static inline void io_poll_execute(struct io_kiocb *req, int res) { if (io_poll_get_ownership(req)) __io_poll_execute(req, res); } /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. * * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action * require, which is either spurious wakeup or multishot CQE is served. * IOU_POLL_DONE when it's done with the request, then the mask is stored in * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) return -ECANCELED; do { v = atomic_read(&req->poll_refs); if (unlikely(v != 1)) { /* tw should be the owner and so have some refs */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) return IOU_POLL_NO_ACTION; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; /* * cqe.res contains only events of the first wake up * and all others are to be lost. Redo vfs_poll() to get * up to date state. */ if ((v & IO_POLL_REF_MASK) != 1) req->cqe.res = 0; if (v & IO_POLL_RETRY_FLAG) { req->cqe.res = 0; /* * We won't find new events that came in between * vfs_poll and the ref put unless we clear the * flag in advance. */ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); v &= ~IO_POLL_RETRY_FLAG; } } /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; /* * We got woken with a mask, but someone else got to * it first. The above vfs_poll() doesn't add us back * to the waitqueue, so if we get nothing back, we * should be safe and attempt a reissue. */ if (unlikely(!req->cqe.res)) { /* Multishot armed need not reissue */ if (!(req->apoll_events & EPOLLONESHOT)) continue; return IOU_POLL_REISSUE; } } if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; if (ret < 0) return ret; } /* force the next iteration to vfs_poll() */ req->cqe.res = 0; /* * Release all references, retry if someone tried to restart * task_work while we were executing it. */ v &= IO_POLL_REF_MASK; } while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); io_napi_add(req); return IOU_POLL_NO_ACTION; } void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) { return; } else if (ret == IOU_POLL_REQUEUE) { __io_poll_execute(req, 0); return; } io_poll_remove_entries(req); io_poll_tw_hash_eject(req, ts); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { struct io_poll *poll; poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { io_req_task_submit(req, ts); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; req_set_fail(req); } io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, ts); } else { io_tw_lock(req->ctx, ts); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) io_req_task_complete(req, ts); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) io_req_task_submit(req, ts); else io_req_defer_failed(req, ret); } } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ io_poll_execute(req, 0); } #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ io_poll_execute(req, 0); /* * If the waitqueue is being freed early but someone is already * holds ownership over it, we have to tear down the request as * best we can. That means immediately removing the request from * its waitqueue and preventing all further accesses to the * waitqueue via the request. */ list_del_init(&poll->wait.entry); /* * Careful: this *must* be the last step, since as soon * as req->head is NULL'ed out, the request can be * completed and freed, since aio_poll_complete_work() * will no longer need to take the waitqueue lock. */ smp_store_release(&poll->head, NULL); return 1; } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wqe_to_req(wait); struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); if (unlikely(mask & POLLFREE)) return io_pollfree_wake(req, poll); /* for instances that support it check for an event match first */ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) return 0; if (io_poll_get_ownership(req)) { /* * If we trigger a multishot poll off our own wakeup path, * disable multishot as there is a circular dependency between * CQ posting and triggering the event. */ if (mask & EPOLL_URING_WAKE) poll->events |= EPOLLONESHOT; /* optional, saves extra locking for removal in tw handler */ if (mask && poll->events & EPOLLONESHOT) { list_del_init(&poll->wait.entry); poll->head = NULL; if (wqe_is_double(wait)) req->flags &= ~REQ_F_DOUBLE_POLL; else req->flags &= ~REQ_F_SINGLE_POLL; } __io_poll_execute(req, mask); } return 1; } /* fails only when polling is already completing by the first entry */ static bool io_poll_double_prepare(struct io_kiocb *req) { struct wait_queue_head *head; struct io_poll *poll = io_poll_get_single(req); /* head is RCU protected, see io_poll_remove_entries() comments */ rcu_read_lock(); head = smp_load_acquire(&poll->head); /* * poll arm might not hold ownership and so race for req->flags with * io_poll_wake(). There is only one poll entry queued, serialise with * it by taking its head lock. As we're still arming the tw hanlder * is not going to be run, so there are no races with it. */ if (head) { spin_lock_irq(&head->lock); req->flags |= REQ_F_DOUBLE_POLL; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; spin_unlock_irq(&head->lock); } rcu_read_unlock(); return !!head; } static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll **poll_ptr) { struct io_kiocb *req = pt->req; unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling * (e.g. one for read, one for write). Setup a separate io_poll * if this happens. */ if (unlikely(pt->nr_entries)) { struct io_poll *first = poll; /* double add on the same waitqueue head, ignore */ if (first->head == head) return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { if ((*poll_ptr)->head == head) return; pt->error = -EINVAL; return; } poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; return; } /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; io_init_poll_iocb(poll, first->events); if (!io_poll_double_prepare(req)) { /* the request is completing, just back off */ kfree(poll); return; } *poll_ptr = poll; } else { /* fine to modify, there is no poll queued to race with us */ req->flags |= REQ_F_SINGLE_POLL; } pt->nr_entries++; poll->head = head; poll->wait.private = (void *) wqe_private; if (poll->events & EPOLLEXCLUSIVE) { add_wait_queue_exclusive(head, &poll->wait); } else { add_wait_queue(head, &poll->wait); } } static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll); __io_queue_proc(poll, pt, head, (struct io_poll **) &pt->req->async_data); } static bool io_poll_can_finish_inline(struct io_kiocb *req, struct io_poll_table *pt) { return pt->owning || io_poll_get_ownership(req); } static void io_poll_add_hash(struct io_kiocb *req) { if (req->flags & REQ_F_HASH_LOCKED) io_poll_req_insert_locked(req); else io_poll_req_insert(req); } /* * Returns 0 when it's handed over for polling. The caller owns the requests if * it returns non-zero, but otherwise should not touch it. Negative values * contain an error code. When the result is >0, the polling has completed * inline and ipt.result_mask is set to the mask. */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask, unsigned issue_flags) { INIT_HLIST_NODE(&req->hash_node); io_init_poll_iocb(poll, mask); poll->file = req->file; req->apoll_events = poll->events; ipt->pt._key = mask; ipt->req = req; ipt->error = 0; ipt->nr_entries = 0; /* * Polling is either completed here or via task_work, so if we're in the * task context we're naturally serialised with tw by merit of running * the same task. When it's io-wq, take the ownership to prevent tw * from running. However, when we're in the task context, skip taking * it as an optimisation. * * Note: even though the request won't be completed/freed, without * ownership we still can race with io_poll_wake(). * io_poll_can_finish_inline() tries to deal with that. */ ipt->owning = issue_flags & IO_URING_F_UNLOCKED; atomic_set(&req->poll_refs, (int)ipt->owning); /* io-wq doesn't hold uring_lock */ if (issue_flags & IO_URING_F_UNLOCKED) req->flags &= ~REQ_F_HASH_LOCKED; /* * Exclusive waits may only wake a limited amount of entries * rather than all of them, this may interfere with lazy * wake if someone does wait(events > 1). Ensure we don't do * lazy wake for those, as we need to process each one as they * come in. */ if (poll->events & EPOLLEXCLUSIVE) req->flags |= REQ_F_POLL_NO_LAZY; mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!io_poll_can_finish_inline(req, ipt)) { io_poll_mark_cancelled(req); return 0; } else if (mask && (poll->events & EPOLLET)) { ipt->result_mask = mask; return 1; } return ipt->error ?: -EINVAL; } if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { if (!io_poll_can_finish_inline(req, ipt)) { io_poll_add_hash(req); return 0; } io_poll_remove_entries(req); ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ return 1; } io_poll_add_hash(req); if (mask && (poll->events & EPOLLET) && io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } io_napi_add(req); if (ipt->owning) { /* * Try to release ownership. If we see a change of state, e.g. * poll was waken up, queue up a tw, it'll deal with it. */ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1) __io_poll_execute(req, 0); } return 0; } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); struct async_poll *apoll = pt->req->apoll; __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } /* * We can't reliably detect loops in repeated poll triggers and issue * subsequently failing. But rather than fail these immediately, allow a * certain amount of retries before we give up. Given that this condition * should _rarely_ trigger even once, we should be fine with a larger value. */ #define APOLL_MAX_RETRY 128 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { apoll = io_alloc_cache_get(&ctx->apoll_cache); if (!apoll) goto alloc_apoll; apoll->poll.retries = APOLL_MAX_RETRY; } else { alloc_apoll: apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } apoll->double_poll = NULL; req->apoll = apoll; if (unlikely(!--apoll->poll.retries)) return NULL; return apoll; } int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; /* * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; if (def->pollin) { mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ if (req->flags & REQ_F_CLEAR_POLLIN) mask &= ~EPOLLIN; } else { mask |= EPOLLOUT | EPOLLWRNORM; } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL); req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; io_kbuf_recycle(req, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } static __cold bool io_poll_remove_all_table(struct task_struct *tsk, struct io_hash_table *table, bool cancel_all) { unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; for (i = 0; i < nr_buckets; i++) { struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } spin_unlock(&hb->lock); } return found; } /* * Returns true if we found and killed one or more poll requests */ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) __must_hold(&ctx->uring_lock) { bool ret; ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all); ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); return ret; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, struct io_hash_table *table, struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, table->hash_bits); struct io_hash_bucket *hb = &table->hbs[index]; *out_bucket = NULL; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) continue; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { if (io_cancel_match_sequence(req, cd->seq)) continue; } *out_bucket = hb; return req; } spin_unlock(&hb->lock); return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, struct io_hash_table *table, struct io_hash_bucket **out_bucket) { unsigned nr_buckets = 1U << table->hash_bits; struct io_kiocb *req; int i; *out_bucket = NULL; for (i = 0; i < nr_buckets; i++) { struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (io_cancel_req_match(req, cd)) { *out_bucket = hb; return req; } } spin_unlock(&hb->lock); } return NULL; } static int io_poll_disarm(struct io_kiocb *req) { if (!req) return -ENOENT; if (!io_poll_get_ownership(req)) return -EALREADY; io_poll_remove_entries(req); hash_del(&req->hash_node); return 0; } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, struct io_hash_table *table) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd, table, &bucket); else req = io_poll_find(ctx, false, cd, table, &bucket); if (req) io_poll_cancel_req(req); if (bucket) spin_unlock(&bucket->lock); return req ? 0 : -ENOENT; } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags) { int ret; ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table); if (ret != -ENOENT) return ret; io_ring_submit_lock(ctx, issue_flags); ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked); io_ring_submit_unlock(ctx, issue_flags); return ret; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, unsigned int flags) { u32 events; events = READ_ONCE(sqe->poll32_events); #ifdef __BIG_ENDIAN events = swahw32(events); #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; if (!(flags & IORING_POLL_ADD_LEVEL)) events |= EPOLLET; return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update); u32 flags; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | IORING_POLL_ADD_MULTI)) return -EINVAL; /* meaningless without update */ if (flags == IORING_POLL_ADD_MULTI) return -EINVAL; upd->old_user_data = READ_ONCE(sqe->addr); upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; upd->new_user_data = READ_ONCE(sqe->off); if (!upd->update_user_data && upd->new_user_data) return -EINVAL; if (upd->update_events) upd->events = io_poll_parse_events(sqe, flags); else if (sqe->poll32_events) return -EINVAL; return 0; } int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); u32 flags; if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; poll->events = io_poll_parse_events(sqe, flags); return 0; } int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); struct io_poll_table ipt; int ret; ipt.pt._qproc = io_poll_queue_proc; /* * If sqpoll or single issuer, there is no contention for ->uring_lock * and we'll end up holding it in tw handlers anyway. */ if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER)) req->flags |= REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_ring_ctx *ctx = req->ctx; struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); if (!ret2) goto found; if (ret2 != -ENOENT) { ret = ret2; goto out; } preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); if (ret2) { ret = ret2; goto out; } found: if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { ret = -EFAULT; goto out; } if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll); poll->events &= ~0xffff; poll->events |= poll_update->events & 0xffff; poll->events |= IO_POLL_UNMASK; } if (poll_update->update_user_data) preq->cqe.user_data = poll_update->new_user_data; ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED); /* successfully updated, don't complete poll request */ if (!ret2 || ret2 == -EIOCBQUEUED) goto out; } req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); preq->io_task_work.func = io_req_task_complete; io_req_task_work_add(preq); out: io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { req_set_fail(req); return ret; } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); return IOU_OK; }
2 2 2 2 2 2 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 // SPDX-License-Identifier: GPL-2.0-only /* * Line 6 Linux USB driver * * Copyright (C) 2004-2010 Markus Grabner (grabner@icg.tugraz.at) */ #include <linux/slab.h> #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/control.h> #include "capture.h" #include "driver.h" #include "playback.h" /* Locate name in binary program dump */ #define POD_NAME_OFFSET 0 #define POD_NAME_LENGTH 16 /* Other constants */ #define POD_CONTROL_SIZE 0x80 #define POD_BUFSIZE_DUMPREQ 7 #define POD_STARTUP_DELAY 1000 /* Stages of POD startup procedure */ enum { POD_STARTUP_VERSIONREQ, POD_STARTUP_SETUP, POD_STARTUP_DONE, }; enum { LINE6_BASSPODXT, LINE6_BASSPODXTLIVE, LINE6_BASSPODXTPRO, LINE6_POCKETPOD, LINE6_PODXT, LINE6_PODXTLIVE_POD, LINE6_PODXTPRO, }; struct usb_line6_pod { /* Generic Line 6 USB data */ struct usb_line6 line6; /* Instrument monitor level */ int monitor_level; /* Current progress in startup procedure */ int startup_progress; /* Serial number of device */ u32 serial_number; /* Firmware version (x 100) */ int firmware_version; /* Device ID */ int device_id; }; #define line6_to_pod(x) container_of(x, struct usb_line6_pod, line6) #define POD_SYSEX_CODE 3 /* *INDENT-OFF* */ enum { POD_SYSEX_SAVE = 0x24, POD_SYSEX_SYSTEM = 0x56, POD_SYSEX_SYSTEMREQ = 0x57, /* POD_SYSEX_UPDATE = 0x6c, */ /* software update! */ POD_SYSEX_STORE = 0x71, POD_SYSEX_FINISH = 0x72, POD_SYSEX_DUMPMEM = 0x73, POD_SYSEX_DUMP = 0x74, POD_SYSEX_DUMPREQ = 0x75 /* dumps entire internal memory of PODxt Pro */ /* POD_SYSEX_DUMPMEM2 = 0x76 */ }; enum { POD_MONITOR_LEVEL = 0x04, POD_SYSTEM_INVALID = 0x10000 }; /* *INDENT-ON* */ enum { POD_DUMP_MEMORY = 2 }; enum { POD_BUSY_READ, POD_BUSY_WRITE, POD_CHANNEL_DIRTY, POD_SAVE_PRESSED, POD_BUSY_MIDISEND }; static const struct snd_ratden pod_ratden = { .num_min = 78125, .num_max = 78125, .num_step = 1, .den = 2 }; static struct line6_pcm_properties pod_pcm_properties = { .playback_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_PAUSE | SNDRV_PCM_INFO_SYNC_START), .formats = SNDRV_PCM_FMTBIT_S24_3LE, .rates = SNDRV_PCM_RATE_KNOT, .rate_min = 39062, .rate_max = 39063, .channels_min = 2, .channels_max = 2, .buffer_bytes_max = 60000, .period_bytes_min = 64, .period_bytes_max = 8192, .periods_min = 1, .periods_max = 1024}, .capture_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_SYNC_START), .formats = SNDRV_PCM_FMTBIT_S24_3LE, .rates = SNDRV_PCM_RATE_KNOT, .rate_min = 39062, .rate_max = 39063, .channels_min = 2, .channels_max = 2, .buffer_bytes_max = 60000, .period_bytes_min = 64, .period_bytes_max = 8192, .periods_min = 1, .periods_max = 1024}, .rates = { .nrats = 1, .rats = &pod_ratden}, .bytes_per_channel = 3 /* SNDRV_PCM_FMTBIT_S24_3LE */ }; static const char pod_version_header[] = { 0xf0, 0x7e, 0x7f, 0x06, 0x02 }; static char *pod_alloc_sysex_buffer(struct usb_line6_pod *pod, int code, int size) { return line6_alloc_sysex_buffer(&pod->line6, POD_SYSEX_CODE, code, size); } /* Process a completely received message. */ static void line6_pod_process_message(struct usb_line6 *line6) { struct usb_line6_pod *pod = line6_to_pod(line6); const unsigned char *buf = pod->line6.buffer_message; if (memcmp(buf, pod_version_header, sizeof(pod_version_header)) == 0) { pod->firmware_version = buf[13] * 100 + buf[14] * 10 + buf[15]; pod->device_id = ((int)buf[8] << 16) | ((int)buf[9] << 8) | (int) buf[10]; if (pod->startup_progress == POD_STARTUP_VERSIONREQ) { pod->startup_progress = POD_STARTUP_SETUP; schedule_delayed_work(&line6->startup_work, 0); } return; } /* Only look for sysex messages from this device */ if (buf[0] != (LINE6_SYSEX_BEGIN | LINE6_CHANNEL_DEVICE) && buf[0] != (LINE6_SYSEX_BEGIN | LINE6_CHANNEL_UNKNOWN)) { return; } if (memcmp(buf + 1, line6_midi_id, sizeof(line6_midi_id)) != 0) return; if (buf[5] == POD_SYSEX_SYSTEM && buf[6] == POD_MONITOR_LEVEL) { short value = ((int)buf[7] << 12) | ((int)buf[8] << 8) | ((int)buf[9] << 4) | (int)buf[10]; pod->monitor_level = value; } } /* Send system parameter (from integer). */ static int pod_set_system_param_int(struct usb_line6_pod *pod, int value, int code) { char *sysex; static const int size = 5; sysex = pod_alloc_sysex_buffer(pod, POD_SYSEX_SYSTEM, size); if (!sysex) return -ENOMEM; sysex[SYSEX_DATA_OFS] = code; sysex[SYSEX_DATA_OFS + 1] = (value >> 12) & 0x0f; sysex[SYSEX_DATA_OFS + 2] = (value >> 8) & 0x0f; sysex[SYSEX_DATA_OFS + 3] = (value >> 4) & 0x0f; sysex[SYSEX_DATA_OFS + 4] = (value) & 0x0f; line6_send_sysex_message(&pod->line6, sysex, size); kfree(sysex); return 0; } /* "read" request on "serial_number" special file. */ static ssize_t serial_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%u\n", pod->serial_number); } /* "read" request on "firmware_version" special file. */ static ssize_t firmware_version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%d.%02d\n", pod->firmware_version / 100, pod->firmware_version % 100); } /* "read" request on "device_id" special file. */ static ssize_t device_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%d\n", pod->device_id); } /* POD startup procedure. This is a sequence of functions with special requirements (e.g., must not run immediately after initialization, must not run in interrupt context). After the last one has finished, the device is ready to use. */ static void pod_startup(struct usb_line6 *line6) { struct usb_line6_pod *pod = line6_to_pod(line6); switch (pod->startup_progress) { case POD_STARTUP_VERSIONREQ: /* request firmware version: */ line6_version_request_async(line6); break; case POD_STARTUP_SETUP: /* serial number: */ line6_read_serial_number(&pod->line6, &pod->serial_number); /* ALSA audio interface: */ if (snd_card_register(line6->card)) dev_err(line6->ifcdev, "Failed to register POD card.\n"); pod->startup_progress = POD_STARTUP_DONE; break; default: break; } } /* POD special files: */ static DEVICE_ATTR_RO(device_id); static DEVICE_ATTR_RO(firmware_version); static DEVICE_ATTR_RO(serial_number); static struct attribute *pod_dev_attrs[] = { &dev_attr_device_id.attr, &dev_attr_firmware_version.attr, &dev_attr_serial_number.attr, NULL }; static const struct attribute_group pod_dev_attr_group = { .name = "pod", .attrs = pod_dev_attrs, }; /* control info callback */ static int snd_pod_control_monitor_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 65535; return 0; } /* control get callback */ static int snd_pod_control_monitor_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_line6_pcm *line6pcm = snd_kcontrol_chip(kcontrol); struct usb_line6_pod *pod = line6_to_pod(line6pcm->line6); ucontrol->value.integer.value[0] = pod->monitor_level; return 0; } /* control put callback */ static int snd_pod_control_monitor_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_line6_pcm *line6pcm = snd_kcontrol_chip(kcontrol); struct usb_line6_pod *pod = line6_to_pod(line6pcm->line6); if (ucontrol->value.integer.value[0] == pod->monitor_level) return 0; pod->monitor_level = ucontrol->value.integer.value[0]; pod_set_system_param_int(pod, ucontrol->value.integer.value[0], POD_MONITOR_LEVEL); return 1; } /* control definition */ static const struct snd_kcontrol_new pod_control_monitor = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Monitor Playback Volume", .index = 0, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = snd_pod_control_monitor_info, .get = snd_pod_control_monitor_get, .put = snd_pod_control_monitor_put }; /* Try to init POD device. */ static int pod_init(struct usb_line6 *line6, const struct usb_device_id *id) { int err; struct usb_line6_pod *pod = line6_to_pod(line6); line6->process_message = line6_pod_process_message; line6->startup = pod_startup; /* create sysfs entries: */ err = snd_card_add_dev_attr(line6->card, &pod_dev_attr_group); if (err < 0) return err; /* initialize PCM subsystem: */ err = line6_init_pcm(line6, &pod_pcm_properties); if (err < 0) return err; /* register monitor control: */ err = snd_ctl_add(line6->card, snd_ctl_new1(&pod_control_monitor, line6->line6pcm)); if (err < 0) return err; /* When the sound card is registered at this point, the PODxt Live displays "Invalid Code Error 07", so we do it later in the event handler. */ if (pod->line6.properties->capabilities & LINE6_CAP_CONTROL) { pod->monitor_level = POD_SYSTEM_INVALID; /* initiate startup procedure: */ schedule_delayed_work(&line6->startup_work, msecs_to_jiffies(POD_STARTUP_DELAY)); } return 0; } #define LINE6_DEVICE(prod) USB_DEVICE(0x0e41, prod) #define LINE6_IF_NUM(prod, n) USB_DEVICE_INTERFACE_NUMBER(0x0e41, prod, n) /* table of devices that work with this driver */ static const struct usb_device_id pod_id_table[] = { { LINE6_DEVICE(0x4250), .driver_info = LINE6_BASSPODXT }, { LINE6_DEVICE(0x4642), .driver_info = LINE6_BASSPODXTLIVE }, { LINE6_DEVICE(0x4252), .driver_info = LINE6_BASSPODXTPRO }, { LINE6_IF_NUM(0x5051, 1), .driver_info = LINE6_POCKETPOD }, { LINE6_DEVICE(0x5044), .driver_info = LINE6_PODXT }, { LINE6_IF_NUM(0x4650, 0), .driver_info = LINE6_PODXTLIVE_POD }, { LINE6_DEVICE(0x5050), .driver_info = LINE6_PODXTPRO }, {} }; MODULE_DEVICE_TABLE(usb, pod_id_table); static const struct line6_properties pod_properties_table[] = { [LINE6_BASSPODXT] = { .id = "BassPODxt", .name = "BassPODxt", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_BASSPODXTLIVE] = { .id = "BassPODxtLive", .name = "BassPODxt Live", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 1, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_BASSPODXTPRO] = { .id = "BassPODxtPro", .name = "BassPODxt Pro", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_POCKETPOD] = { .id = "PocketPOD", .name = "Pocket POD", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI, .altsetting = 0, .ep_ctrl_r = 0x82, .ep_ctrl_w = 0x02, /* no audio channel */ }, [LINE6_PODXT] = { .id = "PODxt", .name = "PODxt", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_PODXTLIVE_POD] = { .id = "PODxtLive", .name = "PODxt Live", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 1, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_PODXTPRO] = { .id = "PODxtPro", .name = "PODxt Pro", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, }; /* Probe USB device. */ static int pod_probe(struct usb_interface *interface, const struct usb_device_id *id) { return line6_probe(interface, id, "Line6-POD", &pod_properties_table[id->driver_info], pod_init, sizeof(struct usb_line6_pod)); } static struct usb_driver pod_driver = { .name = KBUILD_MODNAME, .probe = pod_probe, .disconnect = line6_disconnect, #ifdef CONFIG_PM .suspend = line6_suspend, .resume = line6_resume, .reset_resume = line6_resume, #endif .id_table = pod_id_table, }; module_usb_driver(pod_driver); MODULE_DESCRIPTION("Line 6 POD USB driver"); MODULE_LICENSE("GPL");
4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory_hotplug.c * * Copyright (C) */ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/swap.h> #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/memory.h> #include <linux/memremap.h> #include <linux/memory_hotplug.h> #include <linux/vmalloc.h> #include <linux/ioport.h> #include <linux/delay.h> #include <linux/migrate.h> #include <linux/page-isolation.h> #include <linux/pfn.h> #include <linux/suspend.h> #include <linux/mm_inline.h> #include <linux/firmware-map.h> #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> #include <linux/compaction.h> #include <linux/rmap.h> #include <linux/module.h> #include <asm/tlbflush.h> #include "internal.h" #include "shuffle.h" enum { MEMMAP_ON_MEMORY_DISABLE = 0, MEMMAP_ON_MEMORY_ENABLE, MEMMAP_ON_MEMORY_FORCE, }; static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; static inline unsigned long memory_block_memmap_size(void) { return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); } static inline unsigned long memory_block_memmap_on_memory_pages(void) { unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); /* * In "forced" memmap_on_memory mode, we add extra pages to align the * vmemmap size to cover full pageblocks. That way, we can add memory * even if the vmemmap size is not properly aligned, however, we might waste * memory. */ if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) return pageblock_align(nr_pages); return nr_pages; } #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY /* * memory_hotplug.memmap_on_memory parameter */ static int set_memmap_mode(const char *val, const struct kernel_param *kp) { int ret, mode; bool enabled; if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { mode = MEMMAP_ON_MEMORY_FORCE; } else { ret = kstrtobool(val, &enabled); if (ret < 0) return ret; if (enabled) mode = MEMMAP_ON_MEMORY_ENABLE; else mode = MEMMAP_ON_MEMORY_DISABLE; } *((int *)kp->arg) = mode; if (mode == MEMMAP_ON_MEMORY_FORCE) { unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", memmap_pages - PFN_UP(memory_block_memmap_size())); } return 0; } static int get_memmap_mode(char *buffer, const struct kernel_param *kp) { int mode = *((int *)kp->arg); if (mode == MEMMAP_ON_MEMORY_FORCE) return sprintf(buffer, "force\n"); return sprintf(buffer, "%c\n", mode ? 'Y' : 'N'); } static const struct kernel_param_ops memmap_mode_ops = { .set = set_memmap_mode, .get = get_memmap_mode, }; module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" "With value \"force\" it could result in memory wastage due " "to memmap size limitations (Y/N/force)"); static inline bool mhp_memmap_on_memory(void) { return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; } #else static inline bool mhp_memmap_on_memory(void) { return false; } #endif enum { ONLINE_POLICY_CONTIG_ZONES = 0, ONLINE_POLICY_AUTO_MOVABLE, }; static const char * const online_policy_to_str[] = { [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", }; static int set_online_policy(const char *val, const struct kernel_param *kp) { int ret = sysfs_match_string(online_policy_to_str, val); if (ret < 0) return ret; *((int *)kp->arg) = ret; return 0; } static int get_online_policy(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); } /* * memory_hotplug.online_policy: configure online behavior when onlining without * specifying a zone (MMOP_ONLINE) * * "contig-zones": keep zone contiguous * "auto-movable": online memory to ZONE_MOVABLE if the configuration * (auto_movable_ratio, auto_movable_numa_aware) allows for it */ static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; static const struct kernel_param_ops online_policy_ops = { .set = set_online_policy, .get = get_online_policy, }; module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); MODULE_PARM_DESC(online_policy, "Set the online policy (\"contig-zones\", \"auto-movable\") " "Default: \"contig-zones\""); /* * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio * * The ratio represent an upper limit and the kernel might decide to not * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory * doesn't allow for more MOVABLE memory. */ static unsigned int auto_movable_ratio __read_mostly = 301; module_param(auto_movable_ratio, uint, 0644); MODULE_PARM_DESC(auto_movable_ratio, "Set the maximum ratio of MOVABLE:KERNEL memory in the system " "in percent for \"auto-movable\" online policy. Default: 301"); /* * memory_hotplug.auto_movable_numa_aware: consider numa node stats */ #ifdef CONFIG_NUMA static bool auto_movable_numa_aware __read_mostly = true; module_param(auto_movable_numa_aware, bool, 0644); MODULE_PARM_DESC(auto_movable_numa_aware, "Consider numa node stats in addition to global stats in " "\"auto-movable\" online policy. Default: true"); #endif /* CONFIG_NUMA */ /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be * changed by calling set_online_page_callback() for callback registration * and restore_online_page_callback() for generic callback restore. */ static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); void get_online_mems(void) { percpu_down_read(&mem_hotplug_lock); } void put_online_mems(void) { percpu_up_read(&mem_hotplug_lock); } bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE int mhp_default_online_type = MMOP_OFFLINE; #else int mhp_default_online_type = MMOP_ONLINE; #endif static int __init setup_memhp_default_state(char *str) { const int online_type = mhp_online_type_from_str(str); if (online_type >= 0) mhp_default_online_type = online_type; return 1; } __setup("memhp_default_state=", setup_memhp_default_state); void mem_hotplug_begin(void) { cpus_read_lock(); percpu_down_write(&mem_hotplug_lock); } void mem_hotplug_done(void) { percpu_up_write(&mem_hotplug_lock); cpus_read_unlock(); } u64 max_mem_size = U64_MAX; /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size, const char *resource_name) { struct resource *res; unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (strcmp(resource_name, "System RAM")) flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; if (!mhp_range_allowed(start, size, true)) return ERR_PTR(-E2BIG); /* * Make sure value parsed from 'mem=' only restricts memory adding * while booting, so that memory hotplug won't be impacted. Please * refer to document of 'mem=' in kernel-parameters.txt for more * details. */ if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) return ERR_PTR(-E2BIG); /* * Request ownership of the new memory range. This might be * a child of an existing resource that was present but * not marked as busy. */ res = __request_region(&iomem_resource, start, size, resource_name, flags); if (!res) { pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", start, start + size); return ERR_PTR(-EEXIST); } return res; } static void release_memory_resource(struct resource *res) { if (!res) return; release_resource(res); kfree(res); } static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) { /* * Disallow all operations smaller than a sub-section and only * allow operations smaller than a section for * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() * enforces a larger memory_block_size_bytes() granularity for * memory that will be marked online, so this check should only * fire for direct arch_{add,remove}_memory() users outside of * add_memory_resource(). */ unsigned long min_align; if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) min_align = PAGES_PER_SUBSECTION; else min_align = PAGES_PER_SECTION; if (!IS_ALIGNED(pfn | nr_pages, min_align)) return -EINVAL; return 0; } /* * Return page for the valid pfn only if the page is online. All pfn * walkers which rely on the fully initialized page->flags and others * should use this rather than pfn_valid && pfn_to_page */ struct page *pfn_to_online_page(unsigned long pfn) { unsigned long nr = pfn_to_section_nr(pfn); struct dev_pagemap *pgmap; struct mem_section *ms; if (nr >= NR_MEM_SECTIONS) return NULL; ms = __nr_to_section(nr); if (!online_section(ms)) return NULL; /* * Save some code text when online_section() + * pfn_section_valid() are sufficient. */ if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) return NULL; if (!pfn_section_valid(ms, pfn)) return NULL; if (!online_device_section(ms)) return pfn_to_page(pfn); /* * Slowpath: when ZONE_DEVICE collides with * ZONE_{NORMAL,MOVABLE} within the same section some pfns in * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status. */ pgmap = get_dev_pagemap(pfn, NULL); put_dev_pagemap(pgmap); /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ if (pgmap) return NULL; return pfn_to_page(pfn); } EXPORT_SYMBOL_GPL(pfn_to_online_page); int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; int err; struct vmem_altmap *altmap = params->altmap; if (WARN_ON_ONCE(!pgprot_val(params->pgprot))) return -EINVAL; VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); if (altmap) { /* * Validate altmap is within bounds of the total request */ if (altmap->base_pfn != pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); return -EINVAL; } altmap->alloc = 0; } if (check_pfn_span(pfn, nr_pages)) { WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); return -EINVAL; } for (; pfn < end_pfn; pfn += cur_nr_pages) { /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); err = sparse_add_section(nid, pfn, cur_nr_pages, altmap, params->pgmap); if (err) break; cond_resched(); } vmemmap_populate_print_last(); return err; } /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { if (unlikely(!pfn_to_online_page(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) continue; if (zone != page_zone(pfn_to_page(start_pfn))) continue; return start_pfn; } return 0; } /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; /* pfn is the end pfn of a memory section. */ pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { if (unlikely(!pfn_to_online_page(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) continue; if (zone != page_zone(pfn_to_page(pfn))) continue; return pfn; } return 0; } static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; int nid = zone_to_nid(zone); if (zone->zone_start_pfn == start_pfn) { /* * If the section is smallest section in the zone, it need * shrink zone->zone_start_pfn and zone->zone_spanned_pages. * In this case, we find second smallest valid mem_section * for shrinking zone. */ pfn = find_smallest_section_pfn(nid, zone, end_pfn, zone_end_pfn(zone)); if (pfn) { zone->spanned_pages = zone_end_pfn(zone) - pfn; zone->zone_start_pfn = pfn; } else { zone->zone_start_pfn = 0; zone->spanned_pages = 0; } } else if (zone_end_pfn(zone) == end_pfn) { /* * If the section is biggest section in the zone, it need * shrink zone->spanned_pages. * In this case, we find second biggest valid mem_section for * shrinking zone. */ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, start_pfn); if (pfn) zone->spanned_pages = pfn - zone->zone_start_pfn + 1; else { zone->zone_start_pfn = 0; zone->spanned_pages = 0; } } } static void update_pgdat_span(struct pglist_data *pgdat) { unsigned long node_start_pfn = 0, node_end_pfn = 0; struct zone *zone; for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { unsigned long end_pfn = zone_end_pfn(zone); /* No need to lock the zones, they can't change. */ if (!zone->spanned_pages) continue; if (!node_end_pfn) { node_start_pfn = zone->zone_start_pfn; node_end_pfn = end_pfn; continue; } if (end_pfn > node_end_pfn) node_end_pfn = end_pfn; if (zone->zone_start_pfn < node_start_pfn) node_start_pfn = zone->zone_start_pfn; } pgdat->node_start_pfn = node_start_pfn; pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } void __ref remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { const unsigned long end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; unsigned long pfn, cur_nr_pages; /* Poison struct pages because they are now uninitialized again. */ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); page_init_poison(pfn_to_page(pfn), sizeof(struct page) * cur_nr_pages); } /* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So * we will not try to shrink the zones - which is okay as * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. */ if (zone_is_zone_device(zone)) return; clear_zone_contiguous(zone); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); set_zone_contiguous(zone); } /** * __remove_pages() - remove sections of pages * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ void __remove_pages(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { const unsigned long end_pfn = pfn + nr_pages; unsigned long cur_nr_pages; if (check_pfn_span(pfn, nr_pages)) { WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); return; } for (; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); /* Select all remaining pages up to the next section boundary */ cur_nr_pages = min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); sparse_remove_section(pfn, cur_nr_pages, altmap); } } int set_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; get_online_mems(); mutex_lock(&online_page_callback_lock); if (online_page_callback == generic_online_page) { online_page_callback = callback; rc = 0; } mutex_unlock(&online_page_callback_lock); put_online_mems(); return rc; } EXPORT_SYMBOL_GPL(set_online_page_callback); int restore_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; get_online_mems(); mutex_lock(&online_page_callback_lock); if (online_page_callback == callback) { online_page_callback = generic_online_page; rc = 0; } mutex_unlock(&online_page_callback_lock); put_online_mems(); return rc; } EXPORT_SYMBOL_GPL(restore_online_page_callback); /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ void __ref generic_online_page(struct page *page, unsigned int order) { __free_pages_core(page, order, MEMINIT_HOTPLUG); } EXPORT_SYMBOL_GPL(generic_online_page); static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; /* * Online the pages in MAX_PAGE_ORDER aligned chunks. The callback might * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). * When using memmap_on_memory, the range might not be aligned to * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect * this and the first chunk to online will be pageblock_nr_pages. */ for (pfn = start_pfn; pfn < end_pfn;) { int order; /* * Free to online pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is * MAX_PAGE_ORDER-aligned, Set order to MAX_PAGE_ORDER for * the case. */ if (pfn) order = min_t(int, MAX_PAGE_ORDER, __ffs(pfn)); else order = MAX_PAGE_ORDER; (*online_page_callback)(pfn_to_page(pfn), order); pfn += (1UL << order); } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); } /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); arg->status_change_nid = NUMA_NO_NODE; arg->status_change_nid_normal = NUMA_NO_NODE; if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; } static void node_states_set_node(int node, struct memory_notify *arg) { if (arg->status_change_nid_normal >= 0) node_set_state(node, N_NORMAL_MEMORY); if (arg->status_change_nid >= 0) node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { unsigned long old_end_pfn = zone_end_pfn(zone); if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; } static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long nr_pages) { unsigned long old_end_pfn = pgdat_end_pfn(pgdat); if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } #ifdef CONFIG_ZONE_DEVICE static void section_taint_zone_device(unsigned long pfn) { struct mem_section *ms = __pfn_to_section(pfn); ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; } #else static inline void section_taint_zone_device(unsigned long pfn) { } #endif /* * Associate the pfn range with the given zone, initializing the memmaps * and resizing the pgdat/zone data to span the added pages. After this * call, all affected pages are PageOffline(). * * All aligned pageblocks are initialized to the specified migratetype * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, int migratetype) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; clear_zone_contiguous(zone); if (zone_is_empty(zone)) init_currently_empty_zone(zone, start_pfn, nr_pages); resize_zone_range(zone, start_pfn, nr_pages); resize_pgdat_range(pgdat, start_pfn, nr_pages); /* * Subsection population requires care in pfn_to_online_page(). * Set the taint to enable the slow path detection of * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} * section. */ if (zone_is_zone_device(zone)) { if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) section_taint_zone_device(start_pfn); if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) section_taint_zone_device(start_pfn + nr_pages); } /* * TODO now we have a visible range of pages which are not associated * with their zone properly. Not nice but set_pfnblock_flags_mask * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); } struct auto_movable_stats { unsigned long kernel_early_pages; unsigned long movable_pages; }; static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, struct zone *zone) { if (zone_idx(zone) == ZONE_MOVABLE) { stats->movable_pages += zone->present_pages; } else { stats->kernel_early_pages += zone->present_early_pages; #ifdef CONFIG_CMA /* * CMA pages (never on hotplugged memory) behave like * ZONE_MOVABLE. */ stats->movable_pages += zone->cma_pages; stats->kernel_early_pages -= zone->cma_pages; #endif /* CONFIG_CMA */ } } struct auto_movable_group_stats { unsigned long movable_pages; unsigned long req_kernel_early_pages; }; static int auto_movable_stats_account_group(struct memory_group *group, void *arg) { const int ratio = READ_ONCE(auto_movable_ratio); struct auto_movable_group_stats *stats = arg; long pages; /* * We don't support modifying the config while the auto-movable online * policy is already enabled. Just avoid the division by zero below. */ if (!ratio) return 0; /* * Calculate how many early kernel pages this group requires to * satisfy the configured zone ratio. */ pages = group->present_movable_pages * 100 / ratio; pages -= group->present_kernel_pages; if (pages > 0) stats->req_kernel_early_pages += pages; stats->movable_pages += group->present_movable_pages; return 0; } static bool auto_movable_can_online_movable(int nid, struct memory_group *group, unsigned long nr_pages) { unsigned long kernel_early_pages, movable_pages; struct auto_movable_group_stats group_stats = {}; struct auto_movable_stats stats = {}; struct zone *zone; int i; /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ if (nid == NUMA_NO_NODE) { /* TODO: cache values */ for_each_populated_zone(zone) auto_movable_stats_account_zone(&stats, zone); } else { for (i = 0; i < MAX_NR_ZONES; i++) { pg_data_t *pgdat = NODE_DATA(nid); zone = pgdat->node_zones + i; if (populated_zone(zone)) auto_movable_stats_account_zone(&stats, zone); } } kernel_early_pages = stats.kernel_early_pages; movable_pages = stats.movable_pages; /* * Kernel memory inside dynamic memory group allows for more MOVABLE * memory within the same group. Remove the effect of all but the * current group from the stats. */ walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, group, &group_stats); if (kernel_early_pages <= group_stats.req_kernel_early_pages) return false; kernel_early_pages -= group_stats.req_kernel_early_pages; movable_pages -= group_stats.movable_pages; if (group && group->is_dynamic) kernel_early_pages += group->present_kernel_pages; /* * Test if we could online the given number of pages to ZONE_MOVABLE * and still stay in the configured ratio. */ movable_pages += nr_pages; return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; } /* * Returns a default kernel memory zone for the given pfn range. * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. */ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); int zid; for (zid = 0; zid < ZONE_NORMAL; zid++) { struct zone *zone = &pgdat->node_zones[zid]; if (zone_intersects(zone, start_pfn, nr_pages)) return zone; } return &pgdat->node_zones[ZONE_NORMAL]; } /* * Determine to which zone to online memory dynamically based on user * configuration and system stats. We care about the following ratio: * * MOVABLE : KERNEL * * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in * one of the kernel zones. CMA pages inside one of the kernel zones really * behaves like ZONE_MOVABLE, so we treat them accordingly. * * We don't allow for hotplugged memory in a KERNEL zone to increase the * amount of MOVABLE memory we can have, so we end up with: * * MOVABLE : KERNEL_EARLY * * Whereby KERNEL_EARLY is memory in one of the kernel zones, available sinze * boot. We base our calculation on KERNEL_EARLY internally, because: * * a) Hotplugged memory in one of the kernel zones can sometimes still get * hotunplugged, especially when hot(un)plugging individual memory blocks. * There is no coordination across memory devices, therefore "automatic" * hotunplugging, as implemented in hypervisors, could result in zone * imbalances. * b) Early/boot memory in one of the kernel zones can usually not get * hotunplugged again (e.g., no firmware interface to unplug, fragmented * with unmovable allocations). While there are corner cases where it might * still work, it is barely relevant in practice. * * Exceptions are dynamic memory groups, which allow for more MOVABLE * memory within the same memory group -- because in that case, there is * coordination within the single memory device managed by a single driver. * * We rely on "present pages" instead of "managed pages", as the latter is * highly unreliable and dynamic in virtualized environments, and does not * consider boot time allocations. For example, memory ballooning adjusts the * managed pages when inflating/deflating the balloon, and balloon compaction * can even migrate inflated pages between zones. * * Using "present pages" is better but some things to keep in mind are: * * a) Some memblock allocations, such as for the crashkernel area, are * effectively unused by the kernel, yet they account to "present pages". * Fortunately, these allocations are comparatively small in relevant setups * (e.g., fraction of system memory). * b) Some hotplugged memory blocks in virtualized environments, esecially * hotplugged by virtio-mem, look like they are completely present, however, * only parts of the memory block are actually currently usable. * "present pages" is an upper limit that can get reached at runtime. As * we base our calculations on KERNEL_EARLY, this is not an issue. */ static struct zone *auto_movable_zone_for_pfn(int nid, struct memory_group *group, unsigned long pfn, unsigned long nr_pages) { unsigned long online_pages = 0, max_pages, end_pfn; struct page *page; if (!auto_movable_ratio) goto kernel_zone; if (group && !group->is_dynamic) { max_pages = group->s.max_pages; online_pages = group->present_movable_pages; /* If anything is !MOVABLE online the rest !MOVABLE. */ if (group->present_kernel_pages) goto kernel_zone; } else if (!group || group->d.unit_pages == nr_pages) { max_pages = nr_pages; } else { max_pages = group->d.unit_pages; /* * Take a look at all online sections in the current unit. * We can safely assume that all pages within a section belong * to the same zone, because dynamic memory groups only deal * with hotplugged memory. */ pfn = ALIGN_DOWN(pfn, group->d.unit_pages); end_pfn = pfn + group->d.unit_pages; for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { page = pfn_to_online_page(pfn); if (!page) continue; /* If anything is !MOVABLE online the rest !MOVABLE. */ if (!is_zone_movable_page(page)) goto kernel_zone; online_pages += PAGES_PER_SECTION; } } /* * Online MOVABLE if we could *currently* online all remaining parts * MOVABLE. We expect to (add+) online them immediately next, so if * nobody interferes, all will be MOVABLE if possible. */ nr_pages = max_pages - online_pages; if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) goto kernel_zone; #ifdef CONFIG_NUMA if (auto_movable_numa_aware && !auto_movable_can_online_movable(nid, group, nr_pages)) goto kernel_zone; #endif /* CONFIG_NUMA */ return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; kernel_zone: return default_kernel_zone_for_pfn(nid, pfn, nr_pages); } static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); /* * We inherit the existing zone in a simple case where zones do not * overlap in the given range */ if (in_kernel ^ in_movable) return (in_kernel) ? kernel_zone : movable_zone; /* * If the range doesn't belong to any zone or two zones overlap in the * given range then we use movable zone only if movable_node is * enabled because we always online to a kernel zone by default. */ return movable_node_enabled ? movable_zone : kernel_zone; } struct zone *zone_for_pfn_range(int online_type, int nid, struct memory_group *group, unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); return default_zone_for_pfn(nid, start_pfn, nr_pages); } /* * This function should only be called by memory_block_{online,offline}, * and {online,offline}_pages. */ void adjust_present_page_count(struct page *page, struct memory_group *group, long nr_pages) { struct zone *zone = page_zone(page); const bool movable = zone_idx(zone) == ZONE_MOVABLE; /* * We only support onlining/offlining/adding/removing of complete * memory blocks; therefore, either all is either early or hotplugged. */ if (early_section(__pfn_to_section(page_to_pfn(page)))) zone->present_early_pages += nr_pages; zone->present_pages += nr_pages; zone->zone_pgdat->node_present_pages += nr_pages; if (group && movable) group->present_movable_pages += nr_pages; else if (group && !movable) group->present_kernel_pages += nr_pages; } int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone, bool mhp_off_inaccessible) { unsigned long end_pfn = pfn + nr_pages; int ret, i; ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); if (ret) return ret; /* * Memory block is accessible at this stage and hence poison the struct * pages now. If the memory block is accessible during memory hotplug * addition phase, then page poisining is already performed in * sparse_add_section(). */ if (mhp_off_inaccessible) page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn + i); __ClearPageOffline(page); SetPageVmemmapSelfHosted(page); } /* * It might be that the vmemmap_pages fully span sections. If that is * the case, mark those sections online here as otherwise they will be * left offline. */ if (nr_pages >= PAGES_PER_SECTION) online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); return ret; } void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) { unsigned long end_pfn = pfn + nr_pages; /* * It might be that the vmemmap_pages fully span sections. If that is * the case, mark those sections offline here as otherwise they will be * left online. */ if (nr_pages >= PAGES_PER_SECTION) offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); /* * The pages associated with this vmemmap have been offlined, so * we can reset its state here. */ remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } /* * Must be called with mem_hotplug_lock in write mode. */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { unsigned long flags; int need_zonelists_rebuild = 0; const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; /* * {on,off}lining is constrained to full memory sections (or more * precisely to memory blocks from the user space POV). * memmap_on_memory is an exception because it reserves initial part * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; /* associate pfn range with the zone */ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) goto failed_addition; /* * Fixup the number of isolated pageblocks before marking the sections * onlining, such that undo_isolate_page_range() works correctly. */ spin_lock_irqsave(&zone->lock, flags); zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ if (!populated_zone(zone)) { need_zonelists_rebuild = 1; setup_zone_pageset(zone); } online_pages_range(pfn, nr_pages); adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL); /* Basic onlining is complete, allow allocation of onlined pages. */ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); /* * Freshly onlined pages aren't shuffled (e.g., all pages are placed to * the tail of the freelist when undoing isolation). Shuffle the whole * zone to make sure the just onlined pages are properly distributed * across the whole freelist - to create an initial shuffle. */ shuffle_zone(zone); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); kswapd_run(nid); kcompactd_run(nid); writeback_set_ratelimit(); memory_notify(MEM_ONLINE, &arg); return 0; failed_addition: pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); return ret; } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; /* * NODE_DATA is preallocated (free_area_init) but its internal * state is not allocated completely. Add missing pieces. * Completely offline nodes stay around and they just need * reintialization. */ pgdat = NODE_DATA(nid); /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(pgdat); /* * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ build_all_zonelists(pgdat); return pgdat; } /* * __try_online_node - online a node if offlined * @nid: the node ID * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. * * Returns: * 1 -> a new node has been allocated * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ static int __try_online_node(int nid, bool set_node_online) { pg_data_t *pgdat; int ret = 1; if (node_online(nid)) return 0; pgdat = hotadd_init_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } if (set_node_online) { node_set_online(nid); ret = register_one_node(nid); BUG_ON(ret); } out: return ret; } /* * Users of this function always want to online/register the node */ int try_online_node(int nid) { int ret; mem_hotplug_begin(); ret = __try_online_node(nid, true); mem_hotplug_done(); return ret; } static int check_hotplug_memory_range(u64 start, u64 size) { /* memory range must be block size aligned */ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || !IS_ALIGNED(size, memory_block_size_bytes())) { pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", memory_block_size_bytes(), start, size); return -EINVAL; } return 0; } static int online_memory_block(struct memory_block *mem, void *arg) { mem->online_type = mhp_default_online_type; return device_online(&mem->dev); } #ifndef arch_supports_memmap_on_memory static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) { /* * As default, we want the vmemmap to span a complete PMD such that we * can map the vmemmap using a single PMD if supported by the * architecture. */ return IS_ALIGNED(vmemmap_size, PMD_SIZE); } #endif bool mhp_supports_memmap_on_memory(void) { unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); /* * Besides having arch support and the feature enabled at runtime, we * need a few more assumptions to hold true: * * a) The vmemmap pages span complete PMDs: We don't want vmemmap code * to populate memory from the altmap for unrelated parts (i.e., * other memory blocks) * * b) The vmemmap pages (and thereby the pages that will be exposed to * the buddy) have to cover full pageblocks: memory onlining/offlining * code requires applicable ranges to be page-aligned, for example, to * set the migratetypes properly. * * TODO: Although we have a check here to make sure that vmemmap pages * fully populate a PMD, it is not the right place to check for * this. A much better solution involves improving vmemmap code * to fallback to base pages when trying to populate vmemmap using * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. */ if (!mhp_memmap_on_memory()) return false; /* * Make sure the vmemmap allocation is fully contained * so that we always allocate vmemmap memory from altmap area. */ if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) return false; /* * start pfn should be pageblock_nr_pages aligned for correctly * setting migrate types */ if (!pageblock_aligned(memmap_pages)) return false; if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) /* No effective hotplugged memory doesn't make sense. */ return false; return arch_supports_memmap_on_memory(vmemmap_size); } EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory); static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; /* * For memmap_on_memory, the altmaps were added on a per-memblock * basis; we have to process each individual memory block. */ for (cur_start = start; cur_start < start + size; cur_start += memblock_size) { struct vmem_altmap *altmap = NULL; struct memory_block *mem; mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(cur_start))); if (WARN_ON_ONCE(!mem)) continue; altmap = mem->altmap; mem->altmap = NULL; remove_memory_block_devices(cur_start, memblock_size); arch_remove_memory(cur_start, memblock_size, altmap); /* Verify that all vmemmap pages have actually been freed. */ WARN(altmap->alloc, "Altmap not fully unmapped"); kfree(altmap); } } static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, u64 start, u64 size, mhp_t mhp_flags) { unsigned long memblock_size = memory_block_size_bytes(); u64 cur_start; int ret; for (cur_start = start; cur_start < start + size; cur_start += memblock_size) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; struct vmem_altmap mhp_altmap = { .base_pfn = PHYS_PFN(cur_start), .end_pfn = PHYS_PFN(cur_start + memblock_size - 1), }; mhp_altmap.free = memory_block_memmap_on_memory_pages(); if (mhp_flags & MHP_OFFLINE_INACCESSIBLE) mhp_altmap.inaccessible = true; params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap), GFP_KERNEL); if (!params.altmap) { ret = -ENOMEM; goto out; } /* call arch's memory hotadd */ ret = arch_add_memory(nid, cur_start, memblock_size, &params); if (ret < 0) { kfree(params.altmap); goto out; } /* create memory block devices after memory was added */ ret = create_memory_block_devices(cur_start, memblock_size, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, NULL); kfree(params.altmap); goto out; } } return 0; out: if (ret && cur_start != start) remove_memory_blocks_and_altmaps(start, cur_start - start); return ret; } /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; struct memory_group *group = NULL; u64 start, size; bool new_node = false; int ret; start = res->start; size = resource_size(res); ret = check_hotplug_memory_range(start, size); if (ret) return ret; if (mhp_flags & MHP_NID_IS_MGID) { group = memory_group_find_by_id(nid); if (!group) return -EINVAL; nid = group->nid; } if (!node_possible(nid)) { WARN(1, "node %d was absent from the node_possible_map\n", nid); return -EINVAL; } mem_hotplug_begin(); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) memblock_flags = MEMBLOCK_DRIVER_MANAGED; ret = memblock_add_node(start, size, nid, memblock_flags); if (ret) goto error_mem_hotplug_end; } ret = __try_online_node(nid, false); if (ret < 0) goto error; new_node = ret; /* * Self hosted memmap array */ if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) && mhp_supports_memmap_on_memory()) { ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags); if (ret) goto error; } else { ret = arch_add_memory(nid, start, size, &params); if (ret < 0) goto error; /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size, NULL, group); if (ret) { arch_remove_memory(start, size, params.altmap); goto error; } } if (new_node) { /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. * So, check by BUG_ON() to catch it reluctantly.. * We online node here. We can't roll back from here. */ node_set_online(nid); ret = __register_one_node(nid); BUG_ON(ret); } register_memory_blocks_under_node(nid, PFN_DOWN(start), PFN_UP(start + size - 1), MEMINIT_HOTPLUG); /* create new memmap entry */ if (!strcmp(res->name, "System RAM")) firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); /* * In case we're allowed to merge the resource, flag it and trigger * merging now that adding succeeded. */ if (mhp_flags & MHP_MERGE_RESOURCE) merge_system_ram_resource(res); /* online pages if requested */ if (mhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; error: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: mem_hotplug_done(); return ret; } /* requires device_hotplug_lock, see add_memory_resource() */ int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { struct resource *res; int ret; res = register_memory_resource(start, size, "System RAM"); if (IS_ERR(res)) return PTR_ERR(res); ret = add_memory_resource(nid, res, mhp_flags); if (ret < 0) release_memory_resource(res); return ret; } int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) { int rc; lock_device_hotplug(); rc = __add_memory(nid, start, size, mhp_flags); unlock_device_hotplug(); return rc; } EXPORT_SYMBOL_GPL(add_memory); /* * Add special, driver-managed memory to the system as system RAM. Such * memory is not exposed via the raw firmware-provided memmap as system * RAM, instead, it is detected and added by a driver - during cold boot, * after a reboot, and after kexec. * * Reasons why this memory should not be used for the initial memmap of a * kexec kernel or for placing kexec images: * - The booting kernel is in charge of determining how this memory will be * used (e.g., use persistent memory as system RAM) * - Coordination with a hypervisor is required before this memory * can be used (e.g., inaccessible parts). * * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided * memory map") are created. Also, the created memory resource is flagged * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case * this memory as well (esp., not place kexec images onto it). * * The resource_name (visible via /proc/iomem) has to have the format * "System RAM ($DRIVER)". */ int add_memory_driver_managed(int nid, u64 start, u64 size, const char *resource_name, mhp_t mhp_flags) { struct resource *res; int rc; if (!resource_name || strstr(resource_name, "System RAM (") != resource_name || resource_name[strlen(resource_name) - 1] != ')') return -EINVAL; lock_device_hotplug(); res = register_memory_resource(start, size, resource_name); if (IS_ERR(res)) { rc = PTR_ERR(res); goto out_unlock; } rc = add_memory_resource(nid, res, mhp_flags); if (rc < 0) release_memory_resource(res); out_unlock: unlock_device_hotplug(); return rc; } EXPORT_SYMBOL_GPL(add_memory_driver_managed); /* * Platforms should define arch_get_mappable_range() that provides * maximum possible addressable physical memory range for which the * linear mapping could be created. The platform returned address * range must adhere to these following semantics. * * - range.start <= range.end * - Range includes both end points [range.start..range.end] * * There is also a fallback definition provided here, allowing the * entire possible physical address range in case any platform does * not define arch_get_mappable_range(). */ struct range __weak arch_get_mappable_range(void) { struct range mhp_range = { .start = 0UL, .end = -1ULL, }; return mhp_range; } struct range mhp_get_pluggable_range(bool need_mapping) { const u64 max_phys = PHYSMEM_END; struct range mhp_range; if (need_mapping) { mhp_range = arch_get_mappable_range(); if (mhp_range.start > max_phys) { mhp_range.start = 0; mhp_range.end = 0; } mhp_range.end = min_t(u64, mhp_range.end, max_phys); } else { mhp_range.start = 0; mhp_range.end = max_phys; } return mhp_range; } EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) { struct range mhp_range = mhp_get_pluggable_range(need_mapping); u64 end = start + size; if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) return true; pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", start, end, mhp_range.start, mhp_range.end); return false; } #ifdef CONFIG_MEMORY_HOTREMOVE /* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, * non-lru movable pages and hugepages). Will skip over most unmovable * pages (esp., pages that can be skipped when offlining), but bail out on * definitely unmovable pages. * * Returns: * 0 in case a movable page is found and movable_pfn was updated. * -ENOENT in case no movable page was found. * -EBUSY in case a definitely unmovable page was found. */ static int scan_movable_pages(unsigned long start, unsigned long end, unsigned long *movable_pfn) { unsigned long pfn; for (pfn = start; pfn < end; pfn++) { struct page *page; struct folio *folio; if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); if (PageLRU(page)) goto found; if (__PageMovable(page)) goto found; /* * PageOffline() pages that are not marked __PageMovable() and * have a reference count > 0 (after MEM_GOING_OFFLINE) are * definitely unmovable. If their reference count would be 0, * they could at least be skipped when offlining memory. */ if (PageOffline(page) && page_count(page)) return -EBUSY; if (!PageHuge(page)) continue; folio = page_folio(page); /* * This test is racy as we hold no reference or lock. The * hugetlb page could have been free'ed and head is no longer * a hugetlb page before the following check. In such unlikely * cases false positives and negatives are possible. Calling * code must deal with these scenarios. */ if (folio_test_hugetlb_migratable(folio)) goto found; pfn |= folio_nr_pages(folio) - 1; } return -ENOENT; found: *movable_pfn = pfn; return 0; } static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page, *head; LIST_HEAD(source); static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct folio *folio; bool isolated; if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); folio = page_folio(page); head = &folio->page; if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; /* * HWPoison pages have elevated reference counts so the migration would * fail on them. It also doesn't make any sense to migrate them in the * first place. Still try to unmap such a page in case it is still mapped * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep * the unmap as the catch all safety net). */ if (PageHWPoison(page)) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); if (folio_mapped(folio)) try_to_unmap(folio, TTU_IGNORE_MLOCK); continue; } if (!get_page_unless_zero(page)) continue; /* * We can skip free pages. And we can deal with pages on * LRU and non-lru movable pages. */ if (PageLRU(page)) isolated = isolate_lru_page(page); else isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (isolated) { list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); } else { if (__ratelimit(&migrate_rs)) { pr_warn("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); } } put_page(page); } if (!list_empty(&source)) { nodemask_t nmask = node_states[N_MEMORY]; struct migration_target_control mtc = { .nmask = &nmask, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, .reason = MR_MEMORY_HOTPLUG, }; int ret; /* * We have checked that migration range is on a single zone so * we can use the nid of the first page to all the others. */ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); /* * try to allocate from a different node but reuse this node * if there are no other online nodes to be used (e.g. we are * offlining a part of the only existing node) */ node_clear(mtc.nid, nmask); if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { pr_warn("migrating pfn %lx failed ret:%d\n", page_to_pfn(page), ret); dump_page(page, "migration failure"); } } putback_movable_pages(&source); } } } static int __init cmdline_parse_movable_node(char *p) { movable_node_enabled = true; return 0; } early_param("movable_node", cmdline_parse_movable_node); /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; enum zone_type zt; arg->status_change_nid = NUMA_NO_NODE; arg->status_change_nid_normal = NUMA_NO_NODE; /* * Check whether node_states[N_NORMAL_MEMORY] will be changed. * If the memory to be offline is within the range * [0..ZONE_NORMAL], and it is the last present memory there, * the zones in that range will become empty after the offlining, * thus we can determine that we need to clear the node from * node_states[N_NORMAL_MEMORY]. */ for (zt = 0; zt <= ZONE_NORMAL; zt++) present_pages += pgdat->node_zones[zt].present_pages; if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); /* * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM * does not apply as we don't support 32bit. * Here we count the possible pages from ZONE_MOVABLE. * If after having accounted all the pages, we see that the nr_pages * to be offlined is over or equal to the accounted pages, * we know that the node will become empty, and so, we can clear * it for N_MEMORY as well. */ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); } static void node_states_clear_node(int node, struct memory_notify *arg) { if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); if (arg->status_change_nid >= 0) node_clear_state(node, N_MEMORY); } static int count_system_ram_pages_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { unsigned long *nr_system_ram_pages = data; *nr_system_ram_pages += nr_pages; return 0; } /* * Must be called with mem_hotplug_lock in write mode. */ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn, managed_pages, system_ram_pages = 0; const int node = zone_to_nid(zone); unsigned long flags; struct memory_notify arg; char *reason; int ret; /* * {on,off}lining is constrained to full memory sections (or more * precisely to memory blocks from the user space POV). * memmap_on_memory is an exception because it reserves initial part * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; /* * Don't allow to offline memory blocks that contain holes. * Consequently, memory blocks with holes can never get onlined * via the hotplug path - online_pages() - as hotplugged memory has * no holes. This way, we don't have to worry about memory holes, * don't need pfn_valid() checks, and can avoid using * walk_system_ram_range() later. */ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, count_system_ram_pages_cb); if (system_ram_pages != nr_pages) { ret = -EINVAL; reason = "memory holes"; goto failed_removal; } /* * We only support offlining of memory blocks managed by a single zone, * checked by calling code. This is just a sanity check that we might * want to remove in the future. */ if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || page_zone(pfn_to_page(end_pfn - 1)) != zone)) { ret = -EINVAL; reason = "multizone range"; goto failed_removal; } /* * Disable pcplists so that page isolation cannot race with freeing * in a way that pages from isolated pageblock are left on pcplists. */ zone_pcp_disable(zone); lru_cache_disable(); /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, MEMORY_OFFLINE | REPORT_FAILURE, GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); if (ret) { reason = "failure to isolate range"; goto failed_removal_pcplists_disabled; } arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; node_states_check_changes_offline(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); if (ret) { reason = "notifier failure"; goto failed_removal_isolated; } do { pfn = start_pfn; do { /* * Historically we always checked for any signal and * can't limit it to fatal signals without eventually * breaking user space. */ if (signal_pending(current)) { ret = -EINTR; reason = "signal backoff"; goto failed_removal_isolated; } cond_resched(); ret = scan_movable_pages(pfn, end_pfn, &pfn); if (!ret) { /* * TODO: fatal migration failures should bail * out */ do_migrate_range(pfn, end_pfn); } } while (!ret); if (ret != -ENOENT) { reason = "unmovable page"; goto failed_removal_isolated; } /* * Dissolve free hugetlb folios in the memory block before doing * offlining actually in order to make hugetlbfs's object * counting consistent. */ ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn); if (ret) { reason = "failure to dissolve huge pages"; goto failed_removal_isolated; } ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); } while (ret); /* Mark all sections offline and remove free pages from the buddy. */ managed_pages = __offline_isolated_pages(start_pfn, end_pfn); pr_debug("Offlined Pages %ld\n", nr_pages); /* * The memory sections are marked offline, and the pageblock flags * effectively stale; nobody should be touching them. Fixup the number * of isolated pageblocks, memory onlining will properly revert this. */ spin_lock_irqsave(&zone->lock, flags); zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); lru_cache_enable(); zone_pcp_enable(zone); /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -managed_pages); adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); /* * Make sure to mark the node as memory-less before rebuilding the zone * list. Otherwise this node would still appear in the fallback lists. */ node_states_clear_node(node, &arg); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } if (arg.status_change_nid >= 0) { kcompactd_stop(node); kswapd_stop(node); } writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); remove_pfn_range_from_zone(zone, start_pfn, nr_pages); return 0; failed_removal_isolated: /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: lru_cache_enable(); zone_pcp_enable(zone); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); return ret; } static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int *nid = arg; *nid = mem->nid; if (unlikely(mem->state != MEM_OFFLINE)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = beginpa + memory_block_size_bytes() - 1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); return -EBUSY; } return 0; } static int count_memory_range_altmaps_cb(struct memory_block *mem, void *arg) { u64 *num_altmaps = (u64 *)arg; if (mem->altmap) *num_altmaps += 1; return 0; } static int check_cpu_on_node(int nid) { int cpu; for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == nid) /* * the cpu on this node isn't removed, and we can't * offline this node. */ return -EBUSY; } return 0; } static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) { int nid = *(int *)arg; /* * If a memory block belongs to multiple nodes, the stored nid is not * reliable. However, such blocks are always online (e.g., cannot get * offlined) and, therefore, are still spanned by the node. */ return mem->nid == nid ? -EEXIST : 0; } /** * try_offline_node * @nid: the node ID * * Offline a node if all memory sections and cpus of the node are removed. * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call. */ void try_offline_node(int nid) { int rc; /* * If the node still spans pages (especially ZONE_DEVICE), don't * offline it. A node spans memory after move_pfn_range_to_zone(), * e.g., after the memory block was onlined. */ if (node_spanned_pages(nid)) return; /* * Especially offline memory blocks might not be spanned by the * node. They will get spanned by the node once they get onlined. * However, they link to the node in sysfs and can get onlined later. */ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); if (rc) return; if (check_cpu_on_node(nid)) return; /* * all memory/cpu of this node are removed, we can offline this * node now. */ node_set_offline(nid); unregister_one_node(nid); } EXPORT_SYMBOL(try_offline_node); static int memory_blocks_have_altmaps(u64 start, u64 size) { u64 num_memblocks = size / memory_block_size_bytes(); u64 num_altmaps = 0; if (!mhp_memmap_on_memory()) return 0; walk_memory_blocks(start, size, &num_altmaps, count_memory_range_altmaps_cb); if (num_altmaps == 0) return 0; if (WARN_ON_ONCE(num_memblocks != num_altmaps)) return -EINVAL; return 1; } static int __ref try_remove_memory(u64 start, u64 size) { int rc, nid = NUMA_NO_NODE; BUG_ON(check_hotplug_memory_range(start, size)); /* * All memory blocks must be offlined before removing memory. Check * whether all memory blocks in question are offline and return error * if this is not the case. * * While at it, determine the nid. Note that if we'd have mixed nodes, * we'd only try to offline the last determined one -- which is good * enough for the cases we care about. */ rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); if (rc) return rc; /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); mem_hotplug_begin(); rc = memory_blocks_have_altmaps(start, size); if (rc < 0) { mem_hotplug_done(); return rc; } else if (!rc) { /* * Memory block device removal under the device_hotplug_lock is * a barrier against racing online attempts. * No altmaps present, do the removal directly */ remove_memory_block_devices(start, size); arch_remove_memory(start, size, NULL); } else { /* all memblocks in the range have altmaps */ remove_memory_blocks_and_altmaps(start, size); } if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); release_mem_region_adjustable(start, size); if (nid != NUMA_NO_NODE) try_offline_node(nid); mem_hotplug_done(); return 0; } /** * __remove_memory - Remove memory if every memory block is offline * @start: physical address of the region to remove * @size: size of the region to remove * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by * try_offline_node(). */ void __remove_memory(u64 start, u64 size) { /* * trigger BUG() if some memory is not offlined prior to calling this * function */ if (try_remove_memory(start, size)) BUG(); } /* * Remove memory if every memory block is offline, otherwise return -EBUSY is * some memory is not offline */ int remove_memory(u64 start, u64 size) { int rc; lock_device_hotplug(); rc = try_remove_memory(start, size); unlock_device_hotplug(); return rc; } EXPORT_SYMBOL_GPL(remove_memory); static int try_offline_memory_block(struct memory_block *mem, void *arg) { uint8_t online_type = MMOP_ONLINE_KERNEL; uint8_t **online_types = arg; struct page *page; int rc; /* * Sense the online_type via the zone of the memory block. Offlining * with multiple zones within one memory block will be rejected * by offlining code ... so we don't care about that. */ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) online_type = MMOP_ONLINE_MOVABLE; rc = device_offline(&mem->dev); /* * Default is MMOP_OFFLINE - change it only if offlining succeeded, * so try_reonline_memory_block() can do the right thing. */ if (!rc) **online_types = online_type; (*online_types)++; /* Ignore if already offline. */ return rc < 0 ? rc : 0; } static int try_reonline_memory_block(struct memory_block *mem, void *arg) { uint8_t **online_types = arg; int rc; if (**online_types != MMOP_OFFLINE) { mem->online_type = **online_types; rc = device_online(&mem->dev); if (rc < 0) pr_warn("%s: Failed to re-online memory: %d", __func__, rc); } /* Continue processing all remaining memory blocks. */ (*online_types)++; return 0; } /* * Try to offline and remove memory. Might take a long time to finish in case * memory is still in use. Primarily useful for memory devices that logically * unplugged all memory (so it's no longer in use) and want to offline + remove * that memory. */ int offline_and_remove_memory(u64 start, u64 size) { const unsigned long mb_count = size / memory_block_size_bytes(); uint8_t *online_types, *tmp; int rc; if (!IS_ALIGNED(start, memory_block_size_bytes()) || !IS_ALIGNED(size, memory_block_size_bytes()) || !size) return -EINVAL; /* * We'll remember the old online type of each memory block, so we can * try to revert whatever we did when offlining one memory block fails * after offlining some others succeeded. */ online_types = kmalloc_array(mb_count, sizeof(*online_types), GFP_KERNEL); if (!online_types) return -ENOMEM; /* * Initialize all states to MMOP_OFFLINE, so when we abort processing in * try_offline_memory_block(), we'll skip all unprocessed blocks in * try_reonline_memory_block(). */ memset(online_types, MMOP_OFFLINE, mb_count); lock_device_hotplug(); tmp = online_types; rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); /* * In case we succeeded to offline all memory, remove it. * This cannot fail as it cannot get onlined in the meantime. */ if (!rc) { rc = try_remove_memory(start, size); if (rc) pr_err("%s: Failed to remove memory: %d", __func__, rc); } /* * Rollback what we did. While memory onlining might theoretically fail * (nacked by a notifier), it barely ever happens. */ if (rc) { tmp = online_types; walk_memory_blocks(start, size, &tmp, try_reonline_memory_block); } unlock_device_hotplug(); kfree(online_types); return rc; } EXPORT_SYMBOL_GPL(offline_and_remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */
16 14 15 15 16 21 21 22 9 5 2 1 1 2 4 2 1 7 1 1 2 1 1 13 13 7 1 13 12 1 1 13 10 10 3 13 13 13 1 12 13 13 12 6 8 5 4 3 2 2 1 4 4 4 5 4 4 1 5 5 4 18 5 2 1 1 12 10 9 8 13 3 2 1 2 1 1 5 4 3 2 1 1 47 45 5 3 4 5 2 5 13 13 4 4 4 18 48 49 48 47 40 2 1 4 3 3 3 51 18 17 7 7 6 5 9 7 4 3 2 5 4 4 2 3 4 17 18 12 20 20 20 22 22 20 22 56 19 3 3 3 3 2 2 1 3 3 1 1 1 1 1 1 1 3 1 2 2 1 3 3 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/poll.h> #include <net/sock.h> #include "rds.h" /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; static LIST_HEAD(rds_sock_list); DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); /* * This is called as the final descriptor referencing this socket is closed. * We have to unbind the socket so that another socket can be bound to the * address it was using. * * We have to be careful about racing with the incoming path. sock_orphan() * sets SOCK_DEAD and we use that as an indicator to the rx path that new * messages shouldn't be queued. */ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; if (!sk) goto out; rs = rds_sk_to_rs(sk); sock_orphan(sk); /* Note - rds_clear_recv_queue grabs rs_recv_lock, so * that ensures the recv path has completed messing * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); rds_remove_bound(rs); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; spin_unlock_bh(&rds_sock_lock); rds_trans_put(rs->rs_transport); sock->sk = NULL; sock_put(sk); out: return 0; } /* * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but * this seems more conservative. * NB - normally, one would use sk_callback_lock for this, but we can * get here from interrupts, whereas the network code grabs sk_callback_lock * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. */ void rds_wake_sk_sleep(struct rds_sock *rs) { unsigned long flags; read_lock_irqsave(&rs->rs_recv_lock, flags); __rds_wake_sk_sleep(rds_rs_to_sk(rs)); read_unlock_irqrestore(&rs->rs_recv_lock, flags); } static int rds_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); struct sockaddr_in6 *sin6; struct sockaddr_in *sin; int uaddr_len; /* racey, don't care */ if (peer) { if (ipv6_addr_any(&rs->rs_conn_addr)) return -ENOTCONN; if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); sin->sin_family = AF_INET; sin->sin_port = rs->rs_conn_port; sin->sin_addr.s_addr = rs->rs_conn_addr_v4; uaddr_len = sizeof(*sin); } else { sin6 = (struct sockaddr_in6 *)uaddr; sin6->sin6_family = AF_INET6; sin6->sin6_port = rs->rs_conn_port; sin6->sin6_addr = rs->rs_conn_addr; sin6->sin6_flowinfo = 0; /* scope_id is the same as in the bound address. */ sin6->sin6_scope_id = rs->rs_bound_scope_id; uaddr_len = sizeof(*sin6); } } else { /* If socket is not yet bound and the socket is connected, * set the return address family to be the same as the * connected address, but with 0 address value. If it is not * connected, set the family to be AF_UNSPEC (value 0) and * the address size to be that of an IPv4 address. */ if (ipv6_addr_any(&rs->rs_bound_addr)) { if (ipv6_addr_any(&rs->rs_conn_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_UNSPEC; return sizeof(*sin); } #if IS_ENABLED(CONFIG_IPV6) if (!(ipv6_addr_type(&rs->rs_conn_addr) & IPV6_ADDR_MAPPED)) { sin6 = (struct sockaddr_in6 *)uaddr; memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; return sizeof(*sin6); } #endif sin = (struct sockaddr_in *)uaddr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; return sizeof(*sin); } if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { sin = (struct sockaddr_in *)uaddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); sin->sin_family = AF_INET; sin->sin_port = rs->rs_bound_port; sin->sin_addr.s_addr = rs->rs_bound_addr_v4; uaddr_len = sizeof(*sin); } else { sin6 = (struct sockaddr_in6 *)uaddr; sin6->sin6_family = AF_INET6; sin6->sin6_port = rs->rs_bound_port; sin6->sin6_addr = rs->rs_bound_addr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = rs->rs_bound_scope_id; uaddr_len = sizeof(*sin6); } } return uaddr_len; } /* * RDS' poll is without a doubt the least intuitive part of the interface, * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from * a network protocol. * * EPOLLIN is asserted if * - there is data on the receive queue. * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries * to send to a congested destination, the system call may still fail (and * return ENOBUFS). */ static __poll_t rds_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); __poll_t mask = 0; unsigned long flags; poll_wait(file, sk_sleep(sk), wait); if (rs->rs_seen_congestion) poll_wait(file, &rds_poll_waitq, wait); read_lock_irqsave(&rs->rs_recv_lock, flags); if (!rs->rs_cong_monitor) { /* When a congestion map was updated, we signal EPOLLIN for * "historical" reasons. Applications can also poll for * WRBAND instead. */ if (rds_cong_updated_since(&rs->rs_cong_track)) mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); } else { spin_lock(&rs->rs_lock); if (rs->rs_cong_notify) mask |= (EPOLLIN | EPOLLRDNORM); spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || !list_empty(&rs->rs_notify_queue) || !list_empty(&rs->rs_zcookie_queue.zcookie_head)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ if (mask) rs->rs_seen_congestion = 0; return mask; } static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); rds_tos_t utos, tos = 0; switch (cmd) { case SIOCRDSSETTOS: if (get_user(utos, (rds_tos_t __user *)arg)) return -EFAULT; if (rs->rs_transport && rs->rs_transport->get_tos_map) tos = rs->rs_transport->get_tos_map(utos); else return -ENOIOCTLCMD; spin_lock_bh(&rds_sock_lock); if (rs->rs_tos || rs->rs_conn) { spin_unlock_bh(&rds_sock_lock); return -EINVAL; } rs->rs_tos = tos; spin_unlock_bh(&rds_sock_lock); break; case SIOCRDSGETTOS: spin_lock_bh(&rds_sock_lock); tos = rs->rs_tos; spin_unlock_bh(&rds_sock_lock); if (put_user(tos, (rds_tos_t __user *)arg)) return -EFAULT; break; default: return -ENOIOCTLCMD; } return 0; } static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) { struct sockaddr_in6 sin6; struct sockaddr_in sin; int ret = 0; /* racing with another thread binding seems ok here */ if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (len < sizeof(struct sockaddr_in)) { ret = -EINVAL; goto out; } else if (len < sizeof(struct sockaddr_in6)) { /* Assume IPv4 */ if (copy_from_sockptr(&sin, optval, sizeof(struct sockaddr_in))) { ret = -EFAULT; goto out; } ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); sin6.sin6_port = sin.sin_port; } else { if (copy_from_sockptr(&sin6, optval, sizeof(struct sockaddr_in6))) { ret = -EFAULT; goto out; } } rds_send_drop_to(rs, &sin6); out: return ret; } static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, int optlen) { int value; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&value, optval, sizeof(int))) return -EFAULT; *optvar = !!value; return 0; } static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) { int ret; ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); if (ret == 0) { if (rs->rs_cong_monitor) { rds_cong_add_socket(rs); } else { rds_cong_remove_socket(rs); rs->rs_cong_mask = 0; rs->rs_cong_notify = 0; } } return ret; } static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen) { int t_type; if (rs->rs_transport) return -EOPNOTSUPP; /* previously attached to transport */ if (optlen != sizeof(int)) return -EINVAL; if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) return -EFAULT; if (t_type < 0 || t_type >= RDS_TRANS_COUNT) return -EINVAL; rs->rs_transport = rds_trans_get(t_type); return rs->rs_transport ? 0 : -ENOPROTOOPT; } static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, int optlen, int optname) { int val, valbool; if (optlen != sizeof(int)) return -EFAULT; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; valbool = val ? 1 : 0; if (optname == SO_TIMESTAMP_NEW) sock_set_flag(sk, SOCK_TSTAMP_NEW); if (valbool) sock_set_flag(sk, SOCK_RCVTSTAMP); else sock_reset_flag(sk, SOCK_RCVTSTAMP); return 0; } static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_rx_trace_so trace; int i; if (optlen != sizeof(struct rds_rx_trace_so)) return -EFAULT; if (copy_from_sockptr(&trace, optval, sizeof(trace))) return -EFAULT; if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) return -EFAULT; rs->rs_rx_traces = trace.rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { rs->rs_rx_traces = 0; return -EFAULT; } rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; } return 0; } static int rds_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret; if (level != SOL_RDS) { ret = -ENOPROTOOPT; goto out; } switch (optname) { case RDS_CANCEL_SENT_TO: ret = rds_cancel_sent_to(rs, optval, optlen); break; case RDS_GET_MR: ret = rds_get_mr(rs, optval, optlen); break; case RDS_GET_MR_FOR_DEST: ret = rds_get_mr_for_dest(rs, optval, optlen); break; case RDS_FREE_MR: ret = rds_free_mr(rs, optval, optlen); break; case RDS_RECVERR: ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); break; case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; case SO_RDS_TRANSPORT: lock_sock(sock->sk); ret = rds_set_transport(rs, optval, optlen); release_sock(sock->sk); break; case SO_TIMESTAMP_OLD: case SO_TIMESTAMP_NEW: lock_sock(sock->sk); ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); release_sock(sock->sk); break; case SO_RDS_MSG_RXPATH_LATENCY: ret = rds_recv_track_latency(rs, optval, optlen); break; default: ret = -ENOPROTOOPT; } out: return ret; } static int rds_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct rds_sock *rs = rds_sk_to_rs(sock->sk); int ret = -ENOPROTOOPT, len; int trans; if (level != SOL_RDS) goto out; if (get_user(len, optlen)) { ret = -EFAULT; goto out; } switch (optname) { case RDS_INFO_FIRST ... RDS_INFO_LAST: ret = rds_info_getsockopt(sock, optname, optval, optlen); break; case RDS_RECVERR: if (len < sizeof(int)) ret = -EINVAL; else if (put_user(rs->rs_recverr, (int __user *) optval) || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; break; case SO_RDS_TRANSPORT: if (len < sizeof(int)) { ret = -EINVAL; break; } trans = (rs->rs_transport ? rs->rs_transport->t_type : RDS_TRANS_NONE); /* unbound */ if (put_user(trans, (int __user *)optval) || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; break; default: break; } out: return ret; } static int rds_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_in *sin; struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; lock_sock(sk); switch (uaddr->sa_family) { case AF_INET: sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in)) { ret = -EINVAL; break; } if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { ret = -EDESTADDRREQ; break; } if (ipv4_is_multicast(sin->sin_addr.s_addr) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { ret = -EINVAL; break; } ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); rs->rs_conn_port = sin->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6; int addr_type; sin6 = (struct sockaddr_in6 *)uaddr; if (addr_len < sizeof(struct sockaddr_in6)) { ret = -EINVAL; break; } addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) { ret = -EPROTOTYPE; break; } /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) { ret = -EPROTOTYPE; break; } } if (addr_type & IPV6_ADDR_LINKLOCAL) { /* If socket is arleady bound to a link local address, * the peer address must be on the same link. */ if (sin6->sin6_scope_id == 0 || (!ipv6_addr_any(&rs->rs_bound_addr) && rs->rs_bound_scope_id && sin6->sin6_scope_id != rs->rs_bound_scope_id)) { ret = -EINVAL; break; } /* Remember the connected address scope ID. It will * be checked against the binding local address when * the socket is bound. */ rs->rs_bound_scope_id = sin6->sin6_scope_id; } rs->rs_conn_addr = sin6->sin6_addr; rs->rs_conn_port = sin6->sin6_port; break; } #endif default: ret = -EAFNOSUPPORT; break; } release_sock(sk); return ret; } static struct proto rds_proto = { .name = "RDS", .owner = THIS_MODULE, .obj_size = sizeof(struct rds_sock), }; static const struct proto_ops rds_proto_ops = { .family = AF_RDS, .owner = THIS_MODULE, .release = rds_release, .bind = rds_bind, .connect = rds_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = rds_getname, .poll = rds_poll, .ioctl = rds_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = rds_setsockopt, .getsockopt = rds_getsockopt, .sendmsg = rds_sendmsg, .recvmsg = rds_recvmsg, .mmap = sock_no_mmap, }; static void rds_sock_destruct(struct sock *sk) { struct rds_sock *rs = rds_sk_to_rs(sk); WARN_ON((&rs->rs_item != rs->rs_item.next || &rs->rs_item != rs->rs_item.prev)); } static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); rwlock_init(&rs->rs_recv_lock); INIT_LIST_HEAD(&rs->rs_send_queue); INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; rs->rs_tos = 0; rs->rs_conn = NULL; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; spin_unlock_bh(&rds_sock_lock); return 0; } static int rds_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); if (!sk) return -ENOMEM; return __rds_create(sock, sk, protocol); } void rds_sock_addref(struct rds_sock *rs) { sock_hold(rds_rs_to_sk(rs)); } void rds_sock_put(struct rds_sock *rs) { sock_put(rds_rs_to_sk(rs)); } static const struct net_proto_family rds_family_ops = { .family = AF_RDS, .create = rds_create, .owner = THIS_MODULE, }; static void rds_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_sock *rs; struct rds_incoming *inc; unsigned int total = 0; len /= sizeof(struct rds_info_message); spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { /* This option only supports IPv4 sockets. */ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) continue; read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. */ list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) rds_inc_info_copy(inc, iter, inc->i_saddr.s6_addr32[3], rs->rs_bound_addr_v4, 1); } read_unlock(&rs->rs_recv_lock); } spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_incoming *inc; unsigned int total = 0; struct rds_sock *rs; len /= sizeof(struct rds6_info_message); spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { read_lock(&rs->rs_recv_lock); list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) rds6_inc_info_copy(inc, iter, &inc->i_saddr, &rs->rs_bound_addr, 1); } read_unlock(&rs->rs_recv_lock); } spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds6_info_message); } #endif static void rds_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_info_socket sinfo; unsigned int cnt = 0; struct rds_sock *rs; len /= sizeof(struct rds_info_socket); spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) { cnt = rds_sock_count; goto out; } list_for_each_entry(rs, &rds_sock_list, rs_item) { /* This option only supports IPv4 sockets. */ if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) continue; sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); sinfo.bound_addr = rs->rs_bound_addr_v4; sinfo.connected_addr = rs->rs_conn_addr_v4; sinfo.bound_port = rs->rs_bound_port; sinfo.connected_port = rs->rs_conn_port; sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo, sizeof(sinfo)); cnt++; } out: lens->nr = cnt; lens->each = sizeof(struct rds_info_socket); spin_unlock_bh(&rds_sock_lock); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds6_info_socket sinfo6; struct rds_sock *rs; len /= sizeof(struct rds6_info_socket); spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; list_for_each_entry(rs, &rds_sock_list, rs_item) { sinfo6.sndbuf = rds_sk_sndbuf(rs); sinfo6.rcvbuf = rds_sk_rcvbuf(rs); sinfo6.bound_addr = rs->rs_bound_addr; sinfo6.connected_addr = rs->rs_conn_addr; sinfo6.bound_port = rs->rs_bound_port; sinfo6.connected_port = rs->rs_conn_port; sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); } out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds6_info_socket); spin_unlock_bh(&rds_sock_lock); } #endif static void rds_exit(void) { sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); rds_conn_exit(); rds_cong_exit(); rds_sysctl_exit(); rds_threads_exit(); rds_stats_exit(); rds_page_exit(); rds_bind_lock_destroy(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); #endif } module_exit(rds_exit); u32 rds_gen_num; static int __init rds_init(void) { int ret; net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); ret = rds_bind_lock_init(); if (ret) goto out; ret = rds_conn_init(); if (ret) goto out_bind; ret = rds_threads_init(); if (ret) goto out_conn; ret = rds_sysctl_init(); if (ret) goto out_threads; ret = rds_stats_init(); if (ret) goto out_sysctl; ret = proto_register(&rds_proto, 1); if (ret) goto out_stats; ret = sock_register(&rds_family_ops); if (ret) goto out_proto; rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); #endif goto out; out_proto: proto_unregister(&rds_proto); out_stats: rds_stats_exit(); out_sysctl: rds_sysctl_exit(); out_threads: rds_threads_exit(); out_conn: rds_conn_exit(); rds_cong_exit(); rds_page_exit(); out_bind: rds_bind_lock_destroy(); out: return ret; } module_init(rds_init); #define DRV_VERSION "4.0" #define DRV_RELDATE "Feb 12, 2009" MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" " v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_VERSION(DRV_VERSION); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NETPROTO(PF_RDS);
15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 /* * Copyright (c) 2004 The Regents of the University of Michigan. * Copyright (c) 2012 Jeff Layton <jlayton@redhat.com> * All rights reserved. * * Andy Adamson <andros@citi.umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include <crypto/hash.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/module.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/clnt.h> #include <linux/nfsd/cld.h> #include "nfsd.h" #include "state.h" #include "vfs.h" #include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC /* Declarations */ struct nfsd4_client_tracking_ops { int (*init)(struct net *); void (*exit)(struct net *); void (*create)(struct nfs4_client *); void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); uint8_t version; size_t msglen; }; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) { struct cred *new; new = prepare_creds(); if (!new) return -ENOMEM; new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; *original_creds = override_creds(new); put_cred(new); return 0; } static void nfs4_reset_creds(const struct cred *original) { revert_creds(original); } static void md5_to_hex(char *out, char *md5) { int i; for (i=0; i<16; i++) { unsigned char c = md5[i]; *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); } *out = '\0'; } static int nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) { struct xdr_netobj cksum; struct crypto_shash *tfm; int status; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(tfm)) { status = PTR_ERR(tfm); goto out_no_tfm; } cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { status = -ENOMEM; goto out; } status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, cksum.data); if (status) goto out; md5_to_hex(dname, cksum.data); status = 0; out: kfree(cksum.data); crypto_free_shash(tfm); out_no_tfm: return status; } /* * If we had an error generating the recdir name for the legacy tracker * then warn the admin. If the error doesn't appear to be transient, * then disable recovery tracking. */ static void legacy_recdir_name_error(struct nfs4_client *clp, int error) { printk(KERN_ERR "NFSD: unable to generate recoverydir " "name (%d).\n", error); /* * if the algorithm just doesn't exist, then disable the recovery * tracker altogether. The crypto libs will generally return this if * FIPS is enabled as well. */ if (error == -ENOENT) { printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " "Reboot recovery will not function correctly!\n"); nfsd4_client_tracking_exit(clp->net); } } static void __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); return; } name.len = len; crp = nfs4_client_to_reclaim(name, princhash, nn); if (!crp) { kfree(name.data); return; } crp->cr_clp = clp; } static void nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char dname[HEXDIR_LEN]; struct dentry *dir, *dentry; int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; if (!nn->rec_file) return; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return legacy_recdir_name_error(clp, status); status = nfs4_save_creds(&original_cred); if (status < 0) return; status = mnt_want_write_file(nn->rec_file); if (status) goto out_creds; dir = nn->rec_file->f_path.dentry; /* lock the parent */ inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } if (d_really_is_positive(dentry)) /* * In the 4.1 case, where we're called from * reclaim_complete(), records from the previous reboot * may still be left, so this is OK. * * In the 4.0 case, we should never get here; but we may * as well be forgiving and just succeed silently. */ goto out_put; status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) __nfsd4_create_reclaim_record_grace(clp, dname, HEXDIR_LEN, nn); vfs_fsync(nn->rec_file, 0); } else { printk(KERN_ERR "NFSD: failed to write recovery record" " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); } mnt_drop_write_file(nn->rec_file); out_creds: nfs4_reset_creds(original_cred); } typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); struct name_list { char name[HEXDIR_LEN]; struct list_head list; }; struct nfs4_dir_ctx { struct dir_context ctx; struct list_head names; }; static bool nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct nfs4_dir_ctx *ctx = container_of(__ctx, struct nfs4_dir_ctx, ctx); struct name_list *entry; if (namlen != HEXDIR_LEN - 1) return true; entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); if (entry == NULL) return false; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; list_add(&entry->list, &ctx->names); return true; } static int nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) { const struct cred *original_cred; struct dentry *dir = nn->rec_file->f_path.dentry; struct nfs4_dir_ctx ctx = { .ctx.actor = nfsd4_build_namelist, .names = LIST_HEAD_INIT(ctx.names) }; struct name_list *entry, *tmp; int status; status = nfs4_save_creds(&original_cred); if (status < 0) return status; status = vfs_llseek(nn->rec_file, 0, SEEK_SET); if (status < 0) { nfs4_reset_creds(original_cred); return status; } status = iterate_dir(nn->rec_file, &ctx.ctx); inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { struct dentry *dentry; dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); break; } status = f(dir, dentry, nn); dput(dentry); } list_del(&entry->list); kfree(entry); } inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name); list_del(&entry->list); kfree(entry); } return status; } static int nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) { struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } status = -ENOENT; if (d_really_is_negative(dentry)) goto out; status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); out: dput(dentry); out_unlock: inode_unlock(d_inode(dir)); return status; } static void __nfsd4_remove_reclaim_record_grace(const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); return; } name.len = len; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) nfs4_remove_reclaim_record(crp, nn); } static void nfsd4_remove_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char dname[HEXDIR_LEN]; int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return legacy_recdir_name_error(clp, status); status = mnt_want_write_file(nn->rec_file); if (status) goto out; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); status = nfs4_save_creds(&original_cred); if (status < 0) goto out_drop_write; status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); nfs4_reset_creds(original_cred); if (status == 0) { vfs_fsync(nn->rec_file, 0); if (nn->in_grace) __nfsd4_remove_reclaim_record_grace(dname, HEXDIR_LEN, nn); } out_drop_write: mnt_drop_write_file(nn->rec_file); out: if (status) printk("NFSD: Failed to remove expired client state directory" " %.*s\n", HEXDIR_LEN, dname); } static int purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { int status; struct xdr_netobj name; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", __func__, child); /* Keep trying; maybe the others are OK: */ return 0; } name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); goto out; } name.len = HEXDIR_LEN; if (nfs4_has_reclaimed_state(name, nn)) goto out_free; status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); out_free: kfree(name.data); out: /* Keep trying, success or failure: */ return 0; } static void nfsd4_recdir_purge_old(struct nfsd_net *nn) { int status; nn->in_grace = false; if (!nn->rec_file) return; status = mnt_want_write_file(nn->rec_file); if (status) goto out; status = nfsd4_list_rec_dir(purge_old, nn); if (status == 0) vfs_fsync(nn->rec_file, 0); mnt_drop_write_file(nn->rec_file); out: nfs4_release_reclaim(nn); if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %pD\n", nn->rec_file); } static int load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { struct xdr_netobj name; struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", __func__, child); /* Keep trying; maybe the others are OK: */ return 0; } name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); goto out; } name.len = HEXDIR_LEN; if (!nfs4_client_to_reclaim(name, princhash, nn)) kfree(name.data); out: return 0; } static int nfsd4_recdir_load(struct net *net) { int status; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->rec_file) return 0; status = nfsd4_list_rec_dir(load_recdir, nn); if (status) printk("nfsd4: failed loading clients from recovery" " directory %pD\n", nn->rec_file); return status; } /* * Hold reference to the recovery directory. */ static int nfsd4_init_recdir(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", user_recovery_dirname); BUG_ON(nn->rec_file); status = nfs4_save_creds(&original_cred); if (status < 0) { printk("NFSD: Unable to change credentials to find recovery" " directory: error %d\n", status); return status; } nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(nn->rec_file)) { printk("NFSD: unable to find recovery directory %s\n", user_recovery_dirname); status = PTR_ERR(nn->rec_file); nn->rec_file = NULL; } nfs4_reset_creds(original_cred); if (!status) nn->in_grace = true; return status; } static void nfsd4_shutdown_recdir(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->rec_file) return; fput(nn->rec_file); nn->rec_file = NULL; } static int nfs4_legacy_state_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->reclaim_str_hashtbl) return -ENOMEM; for (i = 0; i < CLIENT_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); nn->reclaim_str_hashtbl_size = 0; return 0; } static void nfs4_legacy_state_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); kfree(nn->reclaim_str_hashtbl); } static int nfsd4_load_reboot_recovery_data(struct net *net) { int status; status = nfsd4_init_recdir(net); if (status) return status; status = nfsd4_recdir_load(net); if (status) nfsd4_shutdown_recdir(net); return status; } static int nfsd4_legacy_tracking_init(struct net *net) { int status; /* XXX: The legacy code won't work in a container */ if (net != &init_net) { pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n"); return -EINVAL; } status = nfs4_legacy_state_init(net); if (status) return status; status = nfsd4_load_reboot_recovery_data(net); if (status) goto err; pr_info("NFSD: Using legacy client tracking operations.\n"); return 0; err: nfs4_legacy_state_shutdown(net); return status; } static void nfsd4_legacy_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); nfsd4_shutdown_recdir(net); nfs4_legacy_state_shutdown(net); } /* * Change the NFSv4 recovery directory to recdir. */ int nfs4_reset_recoverydir(char *recdir) { int status; struct path path; status = kern_path(recdir, LOOKUP_FOLLOW, &path); if (status) return status; status = -ENOTDIR; if (d_is_dir(path.dentry)) { strcpy(user_recovery_dirname, recdir); status = 0; } path_put(&path); return status; } char * nfs4_recoverydir(void) { return user_recovery_dirname; } static int nfsd4_check_legacy_client(struct nfs4_client *clp) { int status; char dname[HEXDIR_LEN]; struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct xdr_netobj name; /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) { legacy_recdir_name_error(clp, status); return status; } /* look for it in the reclaim hashtable otherwise */ name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); goto out_enoent; } name.len = HEXDIR_LEN; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); crp->cr_clp = clp; return 0; } out_enoent: return -ENOENT; } static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .init = nfsd4_legacy_tracking_init, .exit = nfsd4_legacy_tracking_exit, .create = nfsd4_create_clid_dir, .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, .version = 1, .msglen = 0, }; #endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ /* Globals */ #define NFSD_PIPE_DIR "nfsd" #define NFSD_CLD_PIPE "cld" /* per-net-ns structure for holding cld upcall info */ struct cld_net { struct rpc_pipe *cn_pipe; spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; struct crypto_shash *cn_tfm; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING bool cn_has_legacy; #endif }; struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; union { struct cld_msg_hdr cu_hdr; struct cld_msg cu_msg; struct cld_msg_v2 cu_msg_v2; } cu_u; }; static int __cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; struct rpc_pipe_msg msg; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { goto out; } wait_for_completion(&cup->cu_done); if (msg.errno < 0) ret = msg.errno; out: return ret; } static int cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; /* * -EAGAIN occurs when pipe is closed and reopened while there are * upcalls queued. */ do { ret = __cld_pipe_upcall(pipe, cmsg, nn); } while (ret == -EAGAIN); return ret; } static ssize_t __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct nfsd_net *nn) { uint8_t cmd, princhashlen; struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; if (get_user(cmd, &cmsg->cm_cmd)) { dprintk("%s: error when copying cmd from userspace", __func__); return -EFAULT; } if (cmd == Cld_GraceStart) { if (nn->client_tracking_ops->version >= 2) { const struct cld_clntinfo __user *ci; ci = &cmsg->cm_u.cm_clntinfo; if (get_user(namelen, &ci->cc_name.cn_len)) return -EFAULT; name.data = memdup_user(&ci->cc_name.cn_id, namelen); if (IS_ERR(name.data)) return PTR_ERR(name.data); name.len = namelen; get_user(princhashlen, &ci->cc_princhash.cp_len); if (princhashlen > 0) { princhash.data = memdup_user( &ci->cc_princhash.cp_data, princhashlen); if (IS_ERR(princhash.data)) { kfree(name.data); return PTR_ERR(princhash.data); } princhash.len = princhashlen; } else princhash.len = 0; } else { const struct cld_name __user *cnm; cnm = &cmsg->cm_u.cm_name; if (get_user(namelen, &cnm->cn_len)) return -EFAULT; name.data = memdup_user(&cnm->cn_id, namelen); if (IS_ERR(name.data)) return PTR_ERR(name.data); name.len = namelen; } #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { struct cld_net *cn = nn->cld_net; name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } #endif if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); kfree(princhash.data); return -EFAULT; } return nn->client_tracking_ops->msglen; } return -EFAULT; } static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); struct cld_net *cn = nn->cld_net; int16_t status; if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } /* * copy the status so we know whether to remove the upcall from the * list (for -EINPROGRESS, we just want to make sure the xid is * valid, not remove the upcall from the list) */ if (get_user(status, &hdr->cm_status)) { dprintk("%s: error when copying status from userspace", __func__); return -EFAULT; } /* walk the list and find corresponding xid */ cup = NULL; spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; if (status != -EINPROGRESS) list_del_init(&cup->cu_list); break; } } spin_unlock(&cn->cn_lock); /* couldn't find upcall? */ if (!cup) { dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid); return -EINVAL; } if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); return mlen; } static void cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) return; complete(&cup->cu_done); } static const struct rpc_pipe_ops cld_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = cld_pipe_downcall, .destroy_msg = cld_pipe_destroy_msg, }; static struct dentry * nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { struct dentry *dir, *dentry; dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); if (dir == NULL) return ERR_PTR(-ENOENT); dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); dput(dir); return dentry; } static void nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) { if (pipe->dentry) rpc_unlink(pipe->dentry); } static struct dentry * nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *sb; struct dentry *dentry; sb = rpc_get_sb_net(net); if (!sb) return NULL; dentry = nfsd4_cld_register_sb(sb, pipe); rpc_put_sb_net(net); return dentry; } static void nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *sb; sb = rpc_get_sb_net(net); if (sb) { nfsd4_cld_unregister_sb(pipe); rpc_put_sb_net(net); } } /* Initialize rpc_pipefs pipe for communication with client tracking daemon */ static int __nfsd4_init_cld_pipe(struct net *net) { int ret; struct dentry *dentry; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn; if (nn->cld_net) return 0; cn = kzalloc(sizeof(*cn), GFP_KERNEL); if (!cn) { ret = -ENOMEM; goto err; } cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(cn->cn_pipe)) { ret = PTR_ERR(cn->cn_pipe); goto err; } spin_lock_init(&cn->cn_lock); INIT_LIST_HEAD(&cn->cn_list); dentry = nfsd4_cld_register_net(net, cn->cn_pipe); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto err_destroy_data; } cn->cn_pipe->dentry = dentry; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING cn->cn_has_legacy = false; #endif nn->cld_net = cn; return 0; err_destroy_data: rpc_destroy_pipe_data(cn->cn_pipe); err: kfree(cn); printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n", ret); return ret; } static int nfsd4_init_cld_pipe(struct net *net) { int status; status = __nfsd4_init_cld_pipe(net); if (!status) pr_info("NFSD: Using old nfsdcld client tracking operations.\n"); return status; } static void nfsd4_remove_cld_pipe(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); if (cn->cn_tfm) crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } static struct cld_upcall * alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; struct cld_net *cn = nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return new; /* FIXME: hard cap on number in flight? */ restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } static void free_cld_upcall(struct cld_upcall *victim) { struct cld_net *cn = victim->cu_net; spin_lock(&cn->cn_lock); list_del(&victim->cu_list); spin_unlock(&cn->cn_lock); kfree(victim); } /* Ask daemon to create a new record */ static void nfsd4_cld_create(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_Create; cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } free_cld_upcall(cup); out_err: if (ret) printk(KERN_ERR "NFSD: Unable to create client " "record on stable storage: %d\n", ret); } /* Ask daemon to create a new record */ static void nfsd4_cld_create_v2(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct cld_msg_v2 *cmsg; struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cmsg = &cup->cu_u.cu_msg_v2; cmsg->cm_cmd = Cld_Create; cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, clp->cl_name.len); if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { ret = -ENOMEM; goto out; } ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), cksum.data); if (ret) { kfree(cksum.data); goto out; } cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, cksum.data, cksum.len); kfree(cksum.data); } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn); if (!ret) { ret = cmsg->cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } out: free_cld_upcall(cup); out_err: if (ret) pr_err("NFSD: Unable to create client record on stable storage: %d\n", ret); } /* Ask daemon to create a new record */ static void nfsd4_cld_remove(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if it's already removed */ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_Remove; cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } free_cld_upcall(cup); out_err: if (ret) printk(KERN_ERR "NFSD: Unable to remove client " "record from stable storage: %d\n", ret); } /* * For older nfsdcld's that do not allow us to "slurp" the clients * from the tracking database during startup. * * Check for presence of a record, and update its timestamp */ static int nfsd4_cld_check_v0(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if one was already stored during this grace pd */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } cup->cu_u.cu_msg.cm_cmd = Cld_Check; cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } free_cld_upcall(cup); return ret; } /* * For newer nfsdcld's that allow us to "slurp" the clients * from the tracking database during startup. * * Check for presence of a record in the reclaim_str_hashtbl */ static int nfsd4_cld_check(struct nfs4_client *clp) { struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; /* look for it in the reclaim hashtable otherwise */ crp = nfsd4_find_reclaim_client(clp->cl_name, nn); if (crp) goto found; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (nn->cld_net->cn_has_legacy) { int status; char dname[HEXDIR_LEN]; struct xdr_netobj name; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); return -ENOENT; } name.len = HEXDIR_LEN; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) goto found; } #endif return -ENOENT; found: crp->cr_clp = clp; return 0; } static int nfsd4_cld_check_v2(struct nfs4_client *clp) { struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; int status; struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; /* look for it in the reclaim hashtable otherwise */ crp = nfsd4_find_reclaim_client(clp->cl_name, nn); if (crp) goto found; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (cn->cn_has_legacy) { struct xdr_netobj name; char dname[HEXDIR_LEN]; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data\n", __func__); return -ENOENT; } name.len = HEXDIR_LEN; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) goto found; } #endif return -ENOENT; found: if (crp->cr_princhash.len) { if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) return -ENOENT; status = crypto_shash_tfm_digest(tfm, principal, strlen(principal), cksum.data); if (status) { kfree(cksum.data); return -ENOENT; } if (memcmp(crp->cr_princhash.data, cksum.data, crp->cr_princhash.len)) { kfree(cksum.data); return -ENOENT; } kfree(cksum.data); } crp->cr_clp = clp; return 0; } static int nfsd4_cld_grace_start(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: if (ret) dprintk("%s: Unable to get clients from userspace: %d\n", __func__, ret); return ret; } /* For older nfsdcld's that need cm_gracetime */ static void nfsd4_cld_grace_done_v0(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: if (ret) printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } /* * For newer nfsdcld's that do not need cm_gracetime. We also need to call * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl. */ static void nfsd4_cld_grace_done(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: nfs4_release_reclaim(nn); if (ret) printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } static int nfs4_cld_state_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->reclaim_str_hashtbl) return -ENOMEM; for (i = 0; i < CLIENT_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); nn->reclaim_str_hashtbl_size = 0; nn->track_reclaim_completes = true; atomic_set(&nn->nr_reclaim_complete, 0); return 0; } static void nfs4_cld_state_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nn->track_reclaim_completes = false; kfree(nn->reclaim_str_hashtbl); } static bool cld_running(struct nfsd_net *nn) { struct cld_net *cn = nn->cld_net; struct rpc_pipe *pipe = cn->cn_pipe; return pipe->nreaders || pipe->nwriters; } static int nfsd4_cld_get_version(struct nfsd_net *nn) { int ret = 0; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; uint8_t version; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; if (ret) goto out_free; version = cup->cu_u.cu_msg.cm_u.cm_version; dprintk("%s: userspace returned version %u\n", __func__, version); if (version < 1) version = 1; else if (version > CLD_UPCALL_VERSION) version = CLD_UPCALL_VERSION; switch (version) { case 1: nn->client_tracking_ops = &nfsd4_cld_tracking_ops; break; case 2: nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; break; default: break; } } out_free: free_cld_upcall(cup); out_err: if (ret) dprintk("%s: Unable to get version from userspace: %d\n", __func__, ret); return ret; } static int nfsd4_cld_tracking_init(struct net *net) { int status; struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) return status; status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to * queue an upcall unless we know that nfsdcld is running (because we * want this to fail fast so that nfsd4_client_tracking_init() can try * the next client tracking method). nfsdcld should already be running * before nfsd is started, so the wait here is for nfsdcld to open the * pipefs file we just created. */ while (!(running = cld_running(nn)) && retries--) msleep(100); if (!running) { status = -ETIMEDOUT; goto err_remove; } tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { status = PTR_ERR(tfm); goto err_remove; } nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); status = nfsd4_cld_grace_start(nn); if (status) { if (status == -EOPNOTSUPP) pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); nfs4_release_reclaim(nn); goto err_remove; } else pr_info("NFSD: Using nfsdcld client tracking operations.\n"); return 0; err_remove: nfsd4_remove_cld_pipe(net); err_shutdown: nfs4_cld_state_shutdown(net); return status; } static void nfsd4_cld_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); nfsd4_remove_cld_pipe(net); nfs4_cld_state_shutdown(net); } /* For older nfsdcld's */ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .init = nfsd4_init_cld_pipe, .exit = nfsd4_remove_cld_pipe, .create = nfsd4_cld_create, .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v0, .grace_done = nfsd4_cld_grace_done_v0, .version = 1, .msglen = sizeof(struct cld_msg), }; /* For newer nfsdcld's */ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .init = nfsd4_cld_tracking_init, .exit = nfsd4_cld_tracking_exit, .create = nfsd4_cld_create, .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, .version = 1, .msglen = sizeof(struct cld_msg), }; /* v2 create/check ops include the principal, if available */ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { .init = nfsd4_cld_tracking_init, .exit = nfsd4_cld_tracking_exit, .create = nfsd4_cld_create_v2, .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v2, .grace_done = nfsd4_cld_grace_done, .version = 2, .msglen = sizeof(struct cld_msg_v2), }; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); static bool cltrack_legacy_disable; module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cltrack_legacy_disable, "Disable legacy recoverydir conversion. Default: false"); #define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" #define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" #define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION=" #define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START=" static char * nfsd4_cltrack_legacy_topdir(void) { int copied; size_t len; char *result; if (cltrack_legacy_disable) return NULL; len = strlen(LEGACY_TOPDIR_ENV_PREFIX) + strlen(nfs4_recoverydir()) + 1; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s", nfs4_recoverydir()); if (copied >= len) { /* just return nothing if output was truncated */ kfree(result); return NULL; } return result; } static char * nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) { int copied; size_t len; char *result; if (cltrack_legacy_disable) return NULL; /* +1 is for '/' between "topdir" and "recdir" */ len = strlen(LEGACY_RECDIR_ENV_PREFIX) + strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/", nfs4_recoverydir()); if (copied > (len - HEXDIR_LEN)) { /* just return nothing if output will be truncated */ kfree(result); return NULL; } copied = nfs4_make_rec_clidname(result + copied, name); if (copied) { kfree(result); return NULL; } return result; } static char * nfsd4_cltrack_client_has_session(struct nfs4_client *clp) { int copied; size_t len; char *result; /* prefix + Y/N character + terminating NULL */ len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c", clp->cl_minorversion ? 'Y' : 'N'); if (copied >= len) { /* just return nothing if output was truncated */ kfree(result); return NULL; } return result; } static char * nfsd4_cltrack_grace_start(time64_t grace_start) { int copied; size_t len; char *result; /* prefix + max width of int64_t string + terminating NULL */ len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%lld", grace_start); if (copied >= len) { /* just return nothing if output was truncated */ kfree(result); return NULL; } return result; } static int nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) { char *envp[3]; char *argv[4]; int ret; if (unlikely(!cltrack_prog[0])) { dprintk("%s: cltrack_prog is disabled\n", __func__); return -EACCES; } dprintk("%s: cmd: %s\n", __func__, cmd); dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)"); dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)"); envp[0] = env0; envp[1] = env1; envp[2] = NULL; argv[0] = (char *)cltrack_prog; argv[1] = cmd; argv[2] = arg; argv[3] = NULL; ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); /* * Disable the upcall mechanism if we're getting an ENOENT or EACCES * error. The admin can re-enable it on the fly by using sysfs * once the problem has been fixed. */ if (ret == -ENOENT || ret == -EACCES) { dprintk("NFSD: %s was not found or isn't executable (%d). " "Setting cltrack_prog to blank string!", cltrack_prog, ret); cltrack_prog[0] = '\0'; } dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret); return ret; } static char * bin_to_hex_dup(const unsigned char *src, int srclen) { char *buf; /* +1 for terminating NULL */ buf = kzalloc((srclen * 2) + 1, GFP_KERNEL); if (!buf) return buf; bin2hex(buf, src, srclen); return buf; } static int nfsd4_umh_cltrack_init(struct net *net) { int ret; struct nfsd_net *nn = net_generic(net, nfsd_net_id); char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time); /* XXX: The usermode helper s not working in container yet. */ if (net != &init_net) { pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); kfree(grace_start); return -EINVAL; } ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); kfree(grace_start); if (!ret) pr_info("NFSD: Using UMH upcall client tracking operations.\n"); return ret; } static void nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) { wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK, TASK_UNINTERRUPTIBLE); } static void nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) { smp_mb__before_atomic(); clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); smp_mb__after_atomic(); wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); } static void nfsd4_umh_cltrack_create(struct nfs4_client *clp) { char *hexid, *has_session, *grace_start; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); /* * With v4.0 clients, there's little difference in outcome between a * create and check operation, and we can end up calling into this * function multiple times per client (once for each openowner). So, * for v4.0 clients skip upcalling once the client has been recorded * on stable storage. * * For v4.1+ clients, the outcome of the two operations is different, * so we must ensure that we upcall for the create operation. v4.1+ * clients call this on RECLAIM_COMPLETE though, so we should only end * up doing a single create upcall per client. */ if (clp->cl_minorversion == 0 && test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } has_session = nfsd4_cltrack_client_has_session(clp); grace_start = nfsd4_cltrack_grace_start(nn->boot_time); nfsd4_cltrack_upcall_lock(clp); if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start)) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); nfsd4_cltrack_upcall_unlock(clp); kfree(has_session); kfree(grace_start); kfree(hexid); } static void nfsd4_umh_cltrack_remove(struct nfs4_client *clp) { char *hexid; if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } nfsd4_cltrack_upcall_lock(clp); if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) && nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0) clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); nfsd4_cltrack_upcall_unlock(clp); kfree(hexid); } static int nfsd4_umh_cltrack_check(struct nfs4_client *clp) { int ret; char *hexid, *has_session, *legacy; if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return -ENOMEM; } has_session = nfsd4_cltrack_client_has_session(clp); legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); nfsd4_cltrack_upcall_lock(clp); if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) { ret = 0; } else { ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy); if (ret == 0) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } nfsd4_cltrack_upcall_unlock(clp); kfree(has_session); kfree(legacy); kfree(hexid); return ret; } static void nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) { char *legacy; char timestr[22]; /* FIXME: better way to determine max size? */ sprintf(timestr, "%lld", nn->boot_time); legacy = nfsd4_cltrack_legacy_topdir(); nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); kfree(legacy); } static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .init = nfsd4_umh_cltrack_init, .exit = NULL, .create = nfsd4_umh_cltrack_create, .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, .version = 1, .msglen = 0, }; static inline int check_for_legacy_methods(int status, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct path path; /* * Next, try the UMH upcall. */ nn->client_tracking_ops = &nfsd4_umh_tracking_ops; status = nn->client_tracking_ops->init(net); if (!status) return status; /* * Finally, See if the recoverydir exists and is a directory. * If it is, then use the legacy ops. */ nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { status = !d_is_dir(path.dentry); path_put(&path); if (status) return -ENOTDIR; status = nn->client_tracking_ops->init(net); } return status; } #else static inline int check_for_legacy_methods(int status, struct net *net) { return status; } #endif /* CONFIG_LEGACY_NFSD_CLIENT_TRACKING */ int nfsd4_client_tracking_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int status; /* just run the init if it the method is already decided */ if (nn->client_tracking_ops) goto do_init; /* First, try to use nfsdcld */ nn->client_tracking_ops = &nfsd4_cld_tracking_ops; status = nn->client_tracking_ops->init(net); if (!status) return status; if (status != -ETIMEDOUT) { nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; status = nn->client_tracking_ops->init(net); if (!status) return status; } status = check_for_legacy_methods(status, net); if (status) goto out; do_init: status = nn->client_tracking_ops->init(net); out: if (status) { pr_warn("NFSD: Unable to initialize client recovery tracking! (%d)\n", status); pr_warn("NFSD: Is nfsdcld running? If not, enable CONFIG_NFSD_LEGACY_CLIENT_TRACKING.\n"); nn->client_tracking_ops = NULL; } return status; } void nfsd4_client_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->client_tracking_ops) { if (nn->client_tracking_ops->exit) nn->client_tracking_ops->exit(net); nn->client_tracking_ops = NULL; } } void nfsd4_client_record_create(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (nn->client_tracking_ops) nn->client_tracking_ops->create(clp); } void nfsd4_client_record_remove(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (nn->client_tracking_ops) nn->client_tracking_ops->remove(clp); } int nfsd4_client_record_check(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (nn->client_tracking_ops) return nn->client_tracking_ops->check(clp); return -EOPNOTSUPP; } void nfsd4_record_grace_done(struct nfsd_net *nn) { if (nn->client_tracking_ops) nn->client_tracking_ops->grace_done(nn); } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) return 0; if (!cn) { module_put(THIS_MODULE); return 0; } switch (event) { case RPC_PIPEFS_MOUNT: dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); break; } cn->cn_pipe->dentry = dentry; break; case RPC_PIPEFS_UMOUNT: if (cn->cn_pipe->dentry) nfsd4_cld_unregister_sb(cn->cn_pipe); break; default: ret = -ENOTSUPP; break; } module_put(THIS_MODULE); return ret; } static struct notifier_block nfsd4_cld_block = { .notifier_call = rpc_pipefs_event, }; int register_cld_notifier(void) { WARN_ON(!nfsd_net_id); return rpc_pipefs_notifier_register(&nfsd4_cld_block); } void unregister_cld_notifier(void) { rpc_pipefs_notifier_unregister(&nfsd4_cld_block); }
47 44 43 34 34 48 44 235 235 235 3 3 8 8 8 8 142 138 138 97 134 33 15 82 29 82 78 1 8 4 2 58 58 58 34 34 33 1 1 57 19 16 16 18 1 18 29 4 3 38 38 45 30 30 4 129 79 2 142 12 11 11 5 8 2 2 6 1 1 6 12 137 136 137 129 1 128 128 8 4 120 2 118 5 1 115 25 115 2 113 113 7 15 15 24 137 137 28 27 28 6 1 6 1 6 1 6 1 6 2 6 3 6 2 6 6 2 6 1 6 51 2 2 2 9 9 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #include <linux/fs.h> #include <linux/module.h> #include <linux/parser.h> #include <linux/completion.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/mount.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_inode.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" #include "jfs_xattr.h" #include "jfs_dinode.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); MODULE_LICENSE("GPL"); static struct kmem_cache *jfs_inode_cachep; static const struct super_operations jfs_super_operations; static const struct export_operations jfs_export_operations; static struct file_system_type jfs_fs_type; #define MAX_COMMIT_THREADS 64 static int commit_threads; module_param(commit_threads, int, 0); MODULE_PARM_DESC(commit_threads, "Number of commit threads"); static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; struct task_struct *jfsIOthread; struct task_struct *jfsSyncThread; #ifdef CONFIG_JFS_DEBUG int jfsloglevel = JFS_LOGLEVEL_WARN; module_param(jfsloglevel, int, 0644); MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); #endif static void jfs_handle_error(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); if (sb_rdonly(sb)) return; updateSuper(sb, FM_DIRTY); if (sbi->flag & JFS_ERR_PANIC) panic("JFS (device %s): panic forced after error\n", sb->s_id); else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ } void jfs_error(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("ERROR: (device %s): %ps: %pV\n", sb->s_id, __builtin_return_address(0), &vaf); va_end(args); jfs_handle_error(sb); } static struct inode *jfs_alloc_inode(struct super_block *sb) { struct jfs_inode_info *jfs_inode; jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS); if (!jfs_inode) return NULL; #ifdef CONFIG_QUOTA memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot)); #endif return &jfs_inode->vfs_inode; } static void jfs_free_inode(struct inode *inode) { kmem_cache_free(jfs_inode_cachep, JFS_IP(inode)); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); s64 maxinodes; struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; jfs_info("In jfs_statfs"); buf->f_type = JFS_SUPER_MAGIC; buf->f_bsize = sbi->bsize; buf->f_blocks = sbi->bmap->db_mapsize; buf->f_bfree = sbi->bmap->db_nfree; buf->f_bavail = sbi->bmap->db_nfree; /* * If we really return the number of allocated & free inodes, some * applications will fail because they won't see enough free inodes. * We'll try to calculate some guess as to how many inodes we can * really allocate * * buf->f_files = atomic_read(&imap->im_numinos); * buf->f_ffree = atomic_read(&imap->im_numfree); */ maxinodes = min((s64) atomic_read(&imap->im_numinos) + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) << L2INOSPEREXT), (s64) 0xffffffffLL); buf->f_files = maxinodes; buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - atomic_read(&imap->im_numfree)); buf->f_fsid.val[0] = crc32_le(0, (char *)&sbi->uuid, sizeof(sbi->uuid)/2); buf->f_fsid.val[1] = crc32_le(0, (char *)&sbi->uuid + sizeof(sbi->uuid)/2, sizeof(sbi->uuid)/2); buf->f_namelen = JFS_NAME_MAX; return 0; } #ifdef CONFIG_QUOTA static int jfs_quota_off(struct super_block *sb, int type); static int jfs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static void jfs_quota_off_umount(struct super_block *sb) { int type; for (type = 0; type < MAXQUOTAS; type++) jfs_quota_off(sb, type); } static const struct quotactl_ops jfs_quotactl_ops = { .quota_on = jfs_quota_on, .quota_off = jfs_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #else static inline void jfs_quota_off_umount(struct super_block *sb) { } #endif static void jfs_put_super(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); int rc; jfs_info("In jfs_put_super"); jfs_quota_off_umount(sb); rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); unload_nls(sbi->nls_tab); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); iput(sbi->direct_inode); kfree(sbi); } enum { Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, Opt_discard, Opt_nodiscard, Opt_discard_minblk }; static const match_table_t tokens = { {Opt_integrity, "integrity"}, {Opt_nointegrity, "nointegrity"}, {Opt_iocharset, "iocharset=%s"}, {Opt_resize, "resize=%u"}, {Opt_resize_nosize, "resize"}, {Opt_errors, "errors=%s"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%u"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_discard_minblk, "discard=%u"}, {Opt_err, NULL} }; static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, int *flag) { void *nls_map = (void *)-1; /* -1: no change; NULL: none */ char *p; struct jfs_sb_info *sbi = JFS_SBI(sb); *newLVSize = 0; if (!options) return 1; while ((p = strsep(&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_integrity: *flag &= ~JFS_NOINTEGRITY; break; case Opt_nointegrity: *flag |= JFS_NOINTEGRITY; break; case Opt_ignore: /* Silently ignore the quota options */ /* Don't do anything ;-) */ break; case Opt_iocharset: if (nls_map && nls_map != (void *) -1) unload_nls(nls_map); if (!strcmp(args[0].from, "none")) nls_map = NULL; else { nls_map = load_nls(args[0].from); if (!nls_map) { pr_err("JFS: charset not found\n"); goto cleanup; } } break; case Opt_resize: { char *resize = args[0].from; int rc = kstrtoll(resize, 0, newLVSize); if (rc) goto cleanup; break; } case Opt_resize_nosize: { *newLVSize = sb_bdev_nr_blocks(sb); if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); break; } case Opt_errors: { char *errors = args[0].from; if (!errors || !*errors) goto cleanup; if (!strcmp(errors, "continue")) { *flag &= ~JFS_ERR_REMOUNT_RO; *flag &= ~JFS_ERR_PANIC; *flag |= JFS_ERR_CONTINUE; } else if (!strcmp(errors, "remount-ro")) { *flag &= ~JFS_ERR_CONTINUE; *flag &= ~JFS_ERR_PANIC; *flag |= JFS_ERR_REMOUNT_RO; } else if (!strcmp(errors, "panic")) { *flag &= ~JFS_ERR_CONTINUE; *flag &= ~JFS_ERR_REMOUNT_RO; *flag |= JFS_ERR_PANIC; } else { pr_err("JFS: %s is an invalid error handler\n", errors); goto cleanup; } break; } #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: *flag |= JFS_USRQUOTA; break; case Opt_grpquota: *flag |= JFS_GRPQUOTA; break; #else case Opt_usrquota: case Opt_grpquota: case Opt_quota: pr_err("JFS: quota operations not supported\n"); break; #endif case Opt_uid: { char *uid = args[0].from; uid_t val; int rc = kstrtouint(uid, 0, &val); if (rc) goto cleanup; sbi->uid = make_kuid(current_user_ns(), val); if (!uid_valid(sbi->uid)) goto cleanup; break; } case Opt_gid: { char *gid = args[0].from; gid_t val; int rc = kstrtouint(gid, 0, &val); if (rc) goto cleanup; sbi->gid = make_kgid(current_user_ns(), val); if (!gid_valid(sbi->gid)) goto cleanup; break; } case Opt_umask: { char *umask = args[0].from; int rc = kstrtouint(umask, 8, &sbi->umask); if (rc) goto cleanup; if (sbi->umask & ~0777) { pr_err("JFS: Invalid value of umask\n"); goto cleanup; } break; } case Opt_discard: /* if set to 1, even copying files will cause * trimming :O * -> user has more control over the online trimming */ sbi->minblks_trim = 64; if (bdev_max_discard_sectors(sb->s_bdev)) *flag |= JFS_DISCARD; else pr_err("JFS: discard option not supported on device\n"); break; case Opt_nodiscard: *flag &= ~JFS_DISCARD; break; case Opt_discard_minblk: { char *minblks_trim = args[0].from; int rc; if (bdev_max_discard_sectors(sb->s_bdev)) { *flag |= JFS_DISCARD; rc = kstrtouint(minblks_trim, 0, &sbi->minblks_trim); if (rc) goto cleanup; } else pr_err("JFS: discard option not supported on device\n"); break; } default: printk("jfs: Unrecognized mount option \"%s\" or missing value\n", p); goto cleanup; } } if (nls_map != (void *) -1) { /* Discard old (if remount) */ unload_nls(sbi->nls_tab); sbi->nls_tab = nls_map; } return 1; cleanup: if (nls_map && nls_map != (void *) -1) unload_nls(nls_map); return 0; } static int jfs_remount(struct super_block *sb, int *flags, char *data) { s64 newLVSize = 0; int rc = 0; int flag = JFS_SBI(sb)->flag; int ret; sync_filesystem(sb); if (!parse_options(data, sb, &newLVSize, &flag)) return -EINVAL; if (newLVSize) { if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } rc = jfs_extendfs(sb, newLVSize, 0); if (rc) return rc; } if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o */ truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; return rc; } if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) return rc; JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); return ret; } JFS_SBI(sb)->flag = flag; return 0; } static int jfs_fill_super(struct super_block *sb, void *data, int silent) { struct jfs_sb_info *sbi; struct inode *inode; int rc; s64 newLVSize = 0; int flag, ret = -EINVAL; jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; sb->s_max_links = JFS_LINK_MAX; sb->s_time_min = 0; sb->s_time_max = U32_MAX; sbi->sb = sb; sbi->uid = INVALID_UID; sbi->gid = INVALID_GID; sbi->umask = -1; /* initialize the mount flag and determine the default error handler */ flag = JFS_ERR_REMOUNT_RO; if (!parse_options((char *) data, sb, &newLVSize, &flag)) goto out_kfree; sbi->flag = flag; #ifdef CONFIG_JFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif if (newLVSize) { pr_err("resize option for remount only\n"); goto out_kfree; } /* * Initialize blocksize to 4K. */ sb_set_blocksize(sb, PSIZE); /* * Set method vectors. */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &jfs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif /* * Initialize direct-mapping inode/address-space */ inode = new_inode(sb); if (inode == NULL) { ret = -ENOMEM; goto out_unload; } inode->i_size = bdev_nr_bytes(sb->s_bdev); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); sbi->direct_inode = inode; rc = jfs_mount(sb); if (rc) { if (!silent) jfs_err("jfs_mount failed w/return code = %d", rc); goto out_mount_failed; } if (sb_rdonly(sb)) sbi->log = NULL; else { rc = jfs_mount_rw(sb, 0); if (rc) { if (!silent) { jfs_err("jfs_mount_rw failed, return code = %d", rc); } goto out_no_rw; } } sb->s_magic = JFS_SUPER_MAGIC; if (sbi->mntflag & JFS_OS2) sb->s_d_op = &jfs_ci_dentry_operations; inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out_no_rw; } sb->s_root = d_make_root(inode); if (!sb->s_root) goto out_no_root; /* logical blocks are represented by 40 bits in pxd_t, etc. * and page cache is indexed by long */ sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); sb->s_time_gran = 1; return 0; out_no_root: jfs_err("jfs_read_super: get root dentry failed"); out_no_rw: rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); out_mount_failed: filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); sbi->direct_inode = NULL; out_unload: unload_nls(sbi->nls_tab); out_kfree: kfree(sbi); return ret; } static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; int rc = 0; if (!sb_rdonly(sb)) { txQuiesce(sb); rc = lmLogShutdown(log); if (rc) { jfs_error(sb, "lmLogShutdown failed\n"); /* let operations fail rather than hang */ txResume(sb); return rc; } rc = updateSuper(sb, FM_CLEAN); if (rc) { jfs_err("jfs_freeze: updateSuper failed"); /* * Don't fail here. Everything succeeded except * marking the superblock clean, so there's really * no harm in leaving it frozen for now. */ } } return 0; } static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; int rc = 0; if (!sb_rdonly(sb)) { rc = updateSuper(sb, FM_MOUNT); if (rc) { jfs_error(sb, "updateSuper failed\n"); goto out; } rc = lmLogInit(log); if (rc) jfs_error(sb, "lmLogInit failed\n"); out: txResume(sb); } return rc; } static struct dentry *jfs_do_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) { struct jfs_log *log = JFS_SBI(sb)->log; /* log == NULL indicates read-only mount */ if (log) { /* * Write quota structures to quota file, sync_blockdev() will * write them to disk later */ dquot_writeback_dquots(sb, -1); jfs_flush_journal(log, wait); jfs_syncpt(log, 0); } return 0; } static int jfs_show_options(struct seq_file *seq, struct dentry *root) { struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); if (uid_valid(sbi->uid)) seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid)); if (gid_valid(sbi->gid)) seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid)); if (sbi->umask != -1) seq_printf(seq, ",umask=%03o", sbi->umask); if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); if (sbi->flag & JFS_DISCARD) seq_printf(seq, ",discard=%u", sbi->minblks_trim); if (sbi->nls_tab) seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); if (sbi->flag & JFS_ERR_CONTINUE) seq_printf(seq, ",errors=continue"); if (sbi->flag & JFS_ERR_PANIC) seq_printf(seq, ",errors=panic"); #ifdef CONFIG_QUOTA if (sbi->flag & JFS_USRQUOTA) seq_puts(seq, ",usrquota"); if (sbi->flag & JFS_GRPQUOTA) seq_puts(seq, ",grpquota"); #endif return 0; } #ifdef CONFIG_QUOTA /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; sector_t blk = off >> sb->s_blocksize_bits; int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head tmp_bh; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 0); if (err) return err; if (!buffer_mapped(&tmp_bh)) /* A hole? */ memset(data, 0, tocopy); else { bh = sb_bread(sb, tmp_bh.b_blocknr); if (!bh) return -EIO; memcpy(data, bh->b_data+offset, tocopy); brelse(bh); } offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile */ static ssize_t jfs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; sector_t blk = off >> sb->s_blocksize_bits; int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t towrite = len; struct buffer_head tmp_bh; struct buffer_head *bh; inode_lock(inode); while (towrite > 0) { tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; if (offset || tocopy != sb->s_blocksize) bh = sb_bread(sb, tmp_bh.b_blocknr); else bh = sb_getblk(sb, tmp_bh.b_blocknr); if (!bh) { err = -EIO; goto out; } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); brelse(bh); offset = 0; towrite -= tocopy; data += tocopy; blk++; } out: if (len == towrite) { inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; } static struct dquot __rcu **jfs_get_dquots(struct inode *inode) { return JFS_IP(inode)->i_dquot; } static int jfs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; struct inode *inode; err = dquot_quota_on(sb, type, format_id, path); if (err) return err; inode = d_inode(path->dentry); inode_lock(inode); JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); mark_inode_dirty(inode); return 0; } static int jfs_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; int err; if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err) goto out_put; inode_lock(inode); JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); mark_inode_dirty(inode); out_put: iput(inode); return err; out: return dquot_quota_off(sb, type); } #endif static const struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, .free_inode = jfs_free_inode, .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .evict_inode = jfs_evict_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .remount_fs = jfs_remount, .show_options = jfs_show_options, #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, .quota_write = jfs_quota_write, .get_dquots = jfs_get_dquots, #endif }; static const struct export_operations jfs_export_operations = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = jfs_fh_to_dentry, .fh_to_parent = jfs_fh_to_parent, .get_parent = jfs_get_parent, }; static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", .mount = jfs_do_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("jfs"); static void init_once(void *foo) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); INIT_LIST_HEAD(&jfs_ip->anon_inode_list); init_rwsem(&jfs_ip->rdwrlock); mutex_init(&jfs_ip->commit_mutex); init_rwsem(&jfs_ip->xattr_sem); spin_lock_init(&jfs_ip->ag_lock); jfs_ip->active_ag = -1; inode_init_once(&jfs_ip->vfs_inode); } static int __init init_jfs_fs(void) { int i; int rc; jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, offsetof(struct jfs_inode_info, i_inline_all), sizeof_field(struct jfs_inode_info, i_inline_all), init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; /* * Metapage initialization */ rc = metapage_init(); if (rc) { jfs_err("metapage_init failed w/rc = %d", rc); goto free_slab; } /* * Transaction Manager initialization */ rc = txInit(); if (rc) { jfs_err("txInit failed w/rc = %d", rc); goto free_metapage; } /* * I/O completion thread (endio) */ jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); if (IS_ERR(jfsIOthread)) { rc = PTR_ERR(jfsIOthread); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); goto end_txmngr; } if (commit_threads < 1) commit_threads = num_online_cpus(); if (commit_threads > MAX_COMMIT_THREADS) commit_threads = MAX_COMMIT_THREADS; for (i = 0; i < commit_threads; i++) { jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); if (IS_ERR(jfsCommitThread[i])) { rc = PTR_ERR(jfsCommitThread[i]); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); commit_threads = i; goto kill_committask; } } jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); if (IS_ERR(jfsSyncThread)) { rc = PTR_ERR(jfsSyncThread); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); goto kill_committask; } #ifdef PROC_FS_JFS jfs_proc_init(); #endif rc = register_filesystem(&jfs_fs_type); if (!rc) return 0; #ifdef PROC_FS_JFS jfs_proc_clean(); #endif kthread_stop(jfsSyncThread); kill_committask: for (i = 0; i < commit_threads; i++) kthread_stop(jfsCommitThread[i]); kthread_stop(jfsIOthread); end_txmngr: txExit(); free_metapage: metapage_exit(); free_slab: kmem_cache_destroy(jfs_inode_cachep); return rc; } static void __exit exit_jfs_fs(void) { int i; jfs_info("exit_jfs_fs called"); txExit(); metapage_exit(); kthread_stop(jfsIOthread); for (i = 0; i < commit_threads; i++) kthread_stop(jfsCommitThread[i]); kthread_stop(jfsSyncThread); #ifdef PROC_FS_JFS jfs_proc_clean(); #endif unregister_filesystem(&jfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(jfs_inode_cachep); } module_init(init_jfs_fs) module_exit(exit_jfs_fs)
8 8 7 8 8 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 3 8 8 8 8 8 8 8 8 8 3 2 2 2 2 1 1 5 7 4 7 4 6 6 2 6 3 2 1 2 3 1 2 3 2 1 2 3 1 2 1 1 2 3 1 1 1 1 12 11 10 10 5 3 8 8 2 2 4 17 17 3 2 2 1 12 12 4 17 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 /* * Copyright (C) 2017 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> #include "netdevsim.h" #define pr_vlog(env, fmt, ...) \ bpf_verifier_log_write(env, "[netdevsim] " fmt, ##__VA_ARGS__) struct nsim_bpf_bound_prog { struct nsim_dev *nsim_dev; struct bpf_prog *prog; struct dentry *ddir; const char *state; bool is_loaded; struct list_head l; }; #define NSIM_BPF_MAX_KEYS 2 struct nsim_bpf_bound_map { struct netdevsim *ns; struct bpf_offloaded_map *map; struct mutex mutex; struct nsim_map_entry { void *key; void *value; } entry[NSIM_BPF_MAX_KEYS]; struct list_head l; }; static int nsim_bpf_string_show(struct seq_file *file, void *data) { const char **str = file->private; if (*str) seq_printf(file, "%s\n", *str); return 0; } DEFINE_SHOW_ATTRIBUTE(nsim_bpf_string); static int nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) { struct nsim_bpf_bound_prog *state; int ret = 0; state = env->prog->aux->offload->dev_priv; if (state->nsim_dev->bpf_bind_verifier_delay && !insn_idx) msleep(state->nsim_dev->bpf_bind_verifier_delay); if (insn_idx == env->prog->len - 1) { pr_vlog(env, "Hello from netdevsim!\n"); if (!state->nsim_dev->bpf_bind_verifier_accept) ret = -EOPNOTSUPP; } return ret; } static int nsim_bpf_finalize(struct bpf_verifier_env *env) { return 0; } static bool nsim_xdp_offload_active(struct netdevsim *ns) { return ns->xdp_hw.prog; } static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded) { struct nsim_bpf_bound_prog *state; if (!prog || !bpf_prog_is_offloaded(prog->aux)) return; state = prog->aux->offload->dev_priv; state->is_loaded = loaded; } static int nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog) { nsim_prog_set_loaded(ns->bpf_offloaded, false); WARN(!!ns->bpf_offloaded != oldprog, "bad offload state, expected offload %sto be active", oldprog ? "" : "not "); ns->bpf_offloaded = prog; ns->bpf_offloaded_id = prog ? prog->aux->id : 0; nsim_prog_set_loaded(prog, true); return 0; } int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct tc_cls_bpf_offload *cls_bpf = type_data; struct bpf_prog *prog = cls_bpf->prog; struct netdevsim *ns = cb_priv; struct bpf_prog *oldprog; if (type != TC_SETUP_CLSBPF) { NSIM_EA(cls_bpf->common.extack, "only offload of BPF classifiers supported"); return -EOPNOTSUPP; } if (!tc_cls_can_offload_and_chain0(ns->netdev, &cls_bpf->common)) return -EOPNOTSUPP; if (cls_bpf->common.protocol != htons(ETH_P_ALL)) { NSIM_EA(cls_bpf->common.extack, "only ETH_P_ALL supported as filter protocol"); return -EOPNOTSUPP; } if (!ns->bpf_tc_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject BPF TC offload"); return -EOPNOTSUPP; } /* Note: progs without skip_sw will probably not be dev bound */ if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject unbound programs"); return -EOPNOTSUPP; } if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; oldprog = cls_bpf->oldprog; /* Don't remove if oldprog doesn't match driver's state */ if (ns->bpf_offloaded != oldprog) { oldprog = NULL; if (!cls_bpf->prog) return 0; if (ns->bpf_offloaded) { NSIM_EA(cls_bpf->common.extack, "driver and netdev offload states mismatch"); return -EBUSY; } } return nsim_bpf_offload(ns, cls_bpf->prog, oldprog); } int nsim_bpf_disable_tc(struct netdevsim *ns) { if (ns->bpf_offloaded && !nsim_xdp_offload_active(ns)) return -EBUSY; return 0; } static int nsim_xdp_offload_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { if (!nsim_xdp_offload_active(ns) && !bpf->prog) return 0; if (!nsim_xdp_offload_active(ns) && bpf->prog && ns->bpf_offloaded) { NSIM_EA(bpf->extack, "TC program is already loaded"); return -EBUSY; } return nsim_bpf_offload(ns, bpf->prog, nsim_xdp_offload_active(ns)); } static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf, struct xdp_attachment_info *xdp) { int err; if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW && !ns->bpf_xdpoffload_accept) { NSIM_EA(bpf->extack, "XDP offload disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW) { err = nsim_xdp_offload_prog(ns, bpf); if (err) return err; } xdp_attachment_setup(xdp, bpf); return 0; } static int nsim_bpf_create_prog(struct nsim_dev *nsim_dev, struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; char name[16]; int ret; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; state->nsim_dev = nsim_dev; state->prog = prog; state->state = "verify"; /* Program id is not populated yet when we create the state. */ sprintf(name, "%u", nsim_dev->prog_id_gen++); state->ddir = debugfs_create_dir(name, nsim_dev->ddir_bpf_bound_progs); if (IS_ERR(state->ddir)) { ret = PTR_ERR(state->ddir); kfree(state); return ret; } debugfs_create_u32("id", 0400, state->ddir, &prog->aux->id); debugfs_create_file("state", 0400, state->ddir, &state->state, &nsim_bpf_string_fops); debugfs_create_bool("loaded", 0400, state->ddir, &state->is_loaded); list_add_tail(&state->l, &nsim_dev->bpf_bound_progs); prog->aux->offload->dev_priv = state; return 0; } static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { struct nsim_dev *nsim_dev = bpf_offload_dev_priv(prog->aux->offload->offdev); if (!nsim_dev->bpf_bind_accept) return -EOPNOTSUPP; return nsim_bpf_create_prog(nsim_dev, prog); } static int nsim_bpf_translate(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; state->state = "xlated"; return 0; } static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; state = prog->aux->offload->dev_priv; WARN(state->is_loaded, "offload state destroyed while program still bound"); debugfs_remove_recursive(state->ddir); list_del(&state->l); kfree(state); } static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, .translate = nsim_bpf_translate, .destroy = nsim_bpf_destroy_prog, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { if (bpf->prog && bpf->prog->aux->offload) { NSIM_EA(bpf->extack, "attempt to load offloaded prog to drv"); return -EINVAL; } if (ns->netdev->mtu > NSIM_XDP_MAX_MTU) { NSIM_EA(bpf->extack, "MTU too large w/ XDP enabled"); return -EINVAL; } return 0; } static int nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { struct nsim_bpf_bound_prog *state; if (!bpf->prog) return 0; if (!bpf_prog_is_offloaded(bpf->prog->aux)) { NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); return -EINVAL; } state = bpf->prog->aux->offload->dev_priv; if (WARN_ON(strcmp(state->state, "xlated"))) { NSIM_EA(bpf->extack, "offloading program in bad state"); return -EINVAL; } return 0; } static bool nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) { return e->key && !memcmp(key, e->key, map->key_size); } static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) return i; return -ENOENT; } static int nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].key) return -ENOMEM; nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].value) { kfree(nmap->entry[idx].key); nmap->entry[idx].key = NULL; return -ENOMEM; } return 0; } static int nsim_map_get_next_key(struct bpf_offloaded_map *offmap, void *key, void *next_key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx = -ENOENT; mutex_lock(&nmap->mutex); if (key) idx = nsim_map_key_find(offmap, key); if (idx == -ENOENT) idx = 0; else idx++; for (; idx < ARRAY_SIZE(nmap->entry); idx++) { if (nmap->entry[idx].key) { memcpy(next_key, nmap->entry[idx].key, offmap->map.key_size); break; } } mutex_unlock(&nmap->mutex); if (idx == ARRAY_SIZE(nmap->entry)) return -ENOENT; return 0; } static int nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) memcpy(value, nmap->entry[idx].value, offmap->map.value_size); mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static int nsim_map_update_elem(struct bpf_offloaded_map *offmap, void *key, void *value, u64 flags) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx, err = 0; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx < 0 && flags == BPF_EXIST) { err = idx; goto exit_unlock; } if (idx >= 0 && flags == BPF_NOEXIST) { err = -EEXIST; goto exit_unlock; } if (idx < 0) { for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) if (!nmap->entry[idx].key) break; if (idx == ARRAY_SIZE(nmap->entry)) { err = -E2BIG; goto exit_unlock; } err = nsim_map_alloc_elem(offmap, idx); if (err) goto exit_unlock; } memcpy(nmap->entry[idx].key, key, offmap->map.key_size); memcpy(nmap->entry[idx].value, value, offmap->map.value_size); exit_unlock: mutex_unlock(&nmap->mutex); return err; } static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) return -EINVAL; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) { kfree(nmap->entry[idx].key); kfree(nmap->entry[idx].value); memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); } mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static const struct bpf_map_dev_ops nsim_bpf_map_ops = { .map_get_next_key = nsim_map_get_next_key, .map_lookup_elem = nsim_map_lookup_elem, .map_update_elem = nsim_map_update_elem, .map_delete_elem = nsim_map_delete_elem, }; static int nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap; int i, err; if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && offmap->map.map_type != BPF_MAP_TYPE_HASH)) return -EINVAL; if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) return -ENOMEM; if (offmap->map.map_flags) return -EINVAL; nmap = kzalloc(sizeof(*nmap), GFP_KERNEL_ACCOUNT); if (!nmap) return -ENOMEM; offmap->dev_priv = nmap; nmap->ns = ns; nmap->map = offmap; mutex_init(&nmap->mutex); if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { u32 *key; err = nsim_map_alloc_elem(offmap, i); if (err) goto err_free; key = nmap->entry[i].key; *key = i; memset(nmap->entry[i].value, 0, offmap->map.value_size); } } offmap->dev_ops = &nsim_bpf_map_ops; list_add_tail(&nmap->l, &ns->nsim_dev->bpf_bound_maps); return 0; err_free: while (--i >= 0) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } kfree(nmap); return err; } static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } list_del_init(&nmap->l); mutex_destroy(&nmap->mutex); kfree(nmap); } int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); int err; ASSERT_RTNL(); switch (bpf->command) { case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp); case XDP_SETUP_PROG_HW: err = nsim_setup_prog_hw_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp_hw); case BPF_OFFLOAD_MAP_ALLOC: if (!ns->bpf_map_accept) return -EOPNOTSUPP; return nsim_bpf_map_alloc(ns, bpf->offmap); case BPF_OFFLOAD_MAP_FREE: nsim_bpf_map_free(bpf->offmap); return 0; default: return -EINVAL; } } int nsim_bpf_dev_init(struct nsim_dev *nsim_dev) { int err; INIT_LIST_HEAD(&nsim_dev->bpf_bound_progs); INIT_LIST_HEAD(&nsim_dev->bpf_bound_maps); nsim_dev->ddir_bpf_bound_progs = debugfs_create_dir("bpf_bound_progs", nsim_dev->ddir); if (IS_ERR(nsim_dev->ddir_bpf_bound_progs)) return PTR_ERR(nsim_dev->ddir_bpf_bound_progs); nsim_dev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops, nsim_dev); err = PTR_ERR_OR_ZERO(nsim_dev->bpf_dev); if (err) return err; nsim_dev->bpf_bind_accept = true; debugfs_create_bool("bpf_bind_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_accept); debugfs_create_u32("bpf_bind_verifier_delay", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_delay); nsim_dev->bpf_bind_verifier_accept = true; debugfs_create_bool("bpf_bind_verifier_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_accept); return 0; } void nsim_bpf_dev_exit(struct nsim_dev *nsim_dev) { WARN_ON(!list_empty(&nsim_dev->bpf_bound_progs)); WARN_ON(!list_empty(&nsim_dev->bpf_bound_maps)); bpf_offload_dev_destroy(nsim_dev->bpf_dev); } int nsim_bpf_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; int err; err = bpf_offload_dev_netdev_register(ns->nsim_dev->bpf_dev, ns->netdev); if (err) return err; debugfs_create_u32("bpf_offloaded_id", 0400, ddir, &ns->bpf_offloaded_id); ns->bpf_tc_accept = true; debugfs_create_bool("bpf_tc_accept", 0600, ddir, &ns->bpf_tc_accept); debugfs_create_bool("bpf_tc_non_bound_accept", 0600, ddir, &ns->bpf_tc_non_bound_accept); ns->bpf_xdpdrv_accept = true; debugfs_create_bool("bpf_xdpdrv_accept", 0600, ddir, &ns->bpf_xdpdrv_accept); ns->bpf_xdpoffload_accept = true; debugfs_create_bool("bpf_xdpoffload_accept", 0600, ddir, &ns->bpf_xdpoffload_accept); ns->bpf_map_accept = true; debugfs_create_bool("bpf_map_accept", 0600, ddir, &ns->bpf_map_accept); return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); bpf_offload_dev_netdev_unregister(ns->nsim_dev->bpf_dev, ns->netdev); }
13 28 13 23 23 28 28 28 28 28 28 13 13 28 28 28 28 13 28 17 17 17 13 13 13 17 17 17 14 13 7 7 7 17 13 17 13 12 13 13 13 13 13 7 13 13 4 4 4 4 4 4 28 28 1 1 1 13 13 13 13 13 13 28 27 8 8 8 28 28 28 28 28 28 28 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 1 1 1 1 1 1 1 1 6 6 6 6 15 15 15 15 15 15 15 15 15 28 28 28 1 28 15 28 13 13 13 13 28 28 28 28 1 27 1 1 1 1 1 1 28 28 28 28 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/if_vlan.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <rdma/ib_cache.h> #include "core_priv.h" struct ib_pkey_cache { int table_len; u16 table[] __counted_by(table_len); }; struct ib_update_work { struct work_struct work; struct ib_event event; bool enforce_security; }; union ib_gid zgid; EXPORT_SYMBOL(zgid); enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_state { GID_TABLE_ENTRY_INVALID = 1, GID_TABLE_ENTRY_VALID = 2, /* * Indicates that entry is pending to be removed, there may * be active users of this GID entry. * When last user of the GID entry releases reference to it, * GID entry is detached from the table. */ GID_TABLE_ENTRY_PENDING_DEL = 3, }; struct roce_gid_ndev_storage { struct rcu_head rcu_head; struct net_device *ndev; }; struct ib_gid_table_entry { struct kref kref; struct work_struct del_work; struct ib_gid_attr attr; void *context; /* Store the ndev pointer to release reference later on in * call_rcu context because by that time gid_table_entry * and attr might be already freed. So keep a copy of it. * ndev_storage is freed by rcu callback. */ struct roce_gid_ndev_storage *ndev_storage; enum gid_table_entry_state state; }; struct ib_gid_table { int sz; /* In RoCE, adding a GID to the table requires: * (a) Find if this GID is already exists. * (b) Find a free space. * (c) Write the new GID * * Delete requires different set of operations: * (a) Find the GID * (b) Delete it. * **/ /* Any writer to data_vec must hold this lock and the write side of * rwlock. Readers must hold only rwlock. All writers must be in a * sleepable context. */ struct mutex lock; /* rwlock protects data_vec[ix]->state and entry pointer. */ rwlock_t rwlock; struct ib_gid_table_entry **data_vec; /* bit field, each bit indicates the index of default GID */ u32 default_gid_indices; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) { struct ib_event event; event.device = ib_dev; event.element.port_num = port; event.event = IB_EVENT_GID_CHANGE; ib_dispatch_event_clients(&event); } static const char * const gid_type_str[] = { /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for * user space compatibility reasons. */ [IB_GID_TYPE_IB] = "IB/RoCE v1", [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) { if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) return gid_type_str[gid_type]; return "Invalid GID type"; } EXPORT_SYMBOL(ib_cache_gid_type_str); /** rdma_is_zero_gid - Check if given GID is zero or not. * @gid: GID to check * Returns true if given GID is zero, returns false otherwise. */ bool rdma_is_zero_gid(const union ib_gid *gid) { return !memcmp(gid, &zgid, sizeof(*gid)); } EXPORT_SYMBOL(rdma_is_zero_gid); /** is_gid_index_default - Check if a given index belongs to * reserved default GIDs or not. * @table: GID table pointer * @index: Index to check in GID table * Returns true if index is one of the reserved default GID index otherwise * returns false. */ static bool is_gid_index_default(const struct ib_gid_table *table, unsigned int index) { return index < 32 && (BIT(index) & table->default_gid_indices); } int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; size_t len; int err = -EINVAL; len = strlen(buf); if (len == 0) return -EINVAL; if (buf[len - 1] == '\n') len--; for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && len == strlen(gid_type_str[i])) { err = i; break; } return err; } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) { return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) { return !entry; } static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) { return entry && entry->state == GID_TABLE_ENTRY_VALID; } static void schedule_free_gid(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); queue_work(ib_wq, &entry->del_work); } static void put_gid_ndev(struct rcu_head *head) { struct roce_gid_ndev_storage *storage = container_of(head, struct roce_gid_ndev_storage, rcu_head); WARN_ON(!storage->ndev); /* At this point its safe to release netdev reference, * as all callers working on gid_attr->ndev are done * using this netdev. */ dev_put(storage->ndev); kfree(storage); } static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, port_num, entry->attr.index, entry->attr.gid.raw); write_lock_irq(&table->rwlock); /* * The only way to avoid overwriting NULL in table is * by comparing if it is same entry in table or not! * If new entry in table is added by the time we free here, * don't overwrite the table entry. */ if (entry == table->data_vec[entry->attr.index]) table->data_vec[entry->attr.index] = NULL; /* Now this index is ready to be allocated */ write_unlock_irq(&table->rwlock); if (entry->ndev_storage) call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); kfree(entry); } static void free_gid_entry(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); free_gid_entry_locked(entry); } /** * free_gid_work - Release reference to the GID entry * @work: Work structure to refer to GID entry which needs to be * deleted. * * free_gid_work() frees the entry from the HCA's hardware table * if provider supports it. It releases reference to netdevice. */ static void free_gid_work(struct work_struct *work) { struct ib_gid_table_entry *entry = container_of(work, struct ib_gid_table_entry, del_work); struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); mutex_lock(&table->lock); free_gid_entry_locked(entry); mutex_unlock(&table->lock); } static struct ib_gid_table_entry * alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; struct net_device *ndev; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return NULL; ndev = rcu_dereference_protected(attr->ndev, 1); if (ndev) { entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), GFP_KERNEL); if (!entry->ndev_storage) { kfree(entry); return NULL; } dev_hold(ndev); entry->ndev_storage->ndev = ndev; } kref_init(&entry->kref); memcpy(&entry->attr, attr, sizeof(*attr)); INIT_WORK(&entry->del_work, free_gid_work); entry->state = GID_TABLE_ENTRY_INVALID; return entry; } static void store_gid_entry(struct ib_gid_table *table, struct ib_gid_table_entry *entry) { entry->state = GID_TABLE_ENTRY_VALID; dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", __func__, entry->attr.port_num, entry->attr.index, entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); table->data_vec[entry->attr.index] = entry; write_unlock_irq(&table->rwlock); } static void get_gid_entry(struct ib_gid_table_entry *entry) { kref_get(&entry->kref); } static void put_gid_entry(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, schedule_free_gid); } static void put_gid_entry_locked(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, free_gid_entry); } static int add_roce_gid(struct ib_gid_table_entry *entry) { const struct ib_gid_attr *attr = &entry->attr; int ret; if (!attr->ndev) { dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->ops.add_gid(attr, &entry->context); if (ret) { dev_err(&attr->device->dev, "%s GID add failed port=%u index=%u\n", __func__, attr->port_num, attr->index); return ret; } } return 0; } /** * del_gid - Delete GID table entry * * @ib_dev: IB device whose GID entry to be deleted * @port: Port number of the IB device * @table: GID table of the IB device for a port * @ix: GID entry index to delete * */ static void del_gid(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table, int ix) { struct roce_gid_ndev_storage *ndev_storage; struct ib_gid_table_entry *entry; lockdep_assert_held(&table->lock); dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; entry->state = GID_TABLE_ENTRY_PENDING_DEL; /* * For non RoCE protocol, GID entry slot is ready to use. */ if (!rdma_protocol_roce(ib_dev, port)) table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); if (rdma_cap_roce_gid_table(ib_dev, port)) ib_dev->ops.del_gid(&entry->attr, &entry->context); ndev_storage = entry->ndev_storage; if (ndev_storage) { entry->ndev_storage = NULL; rcu_assign_pointer(entry->attr.ndev, NULL); call_rcu(&ndev_storage->rcu_head, put_gid_ndev); } put_gid_entry_locked(entry); } /** * add_modify_gid - Add or modify GID table entry * * @table: GID table in which GID to be added or modified * @attr: Attributes of the GID * * Returns 0 on success or appropriate error code. It accepts zero * GID addition for non RoCE ports for HCA's who report them as valid * GID. However such zero GIDs are not added to the cache. */ static int add_modify_gid(struct ib_gid_table *table, const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; int ret = 0; /* * Invalidate any old entry in the table to make it safe to write to * this index. */ if (is_gid_entry_valid(table->data_vec[attr->index])) del_gid(attr->device, attr->port_num, table, attr->index); /* * Some HCA's report multiple GID entries with only one valid GID, and * leave other unused entries as the zero GID. Convert zero GIDs to * empty table entries instead of storing them. */ if (rdma_is_zero_gid(&attr->gid)) return 0; entry = alloc_gid_entry(attr); if (!entry) return -ENOMEM; if (rdma_protocol_roce(attr->device, attr->port_num)) { ret = add_roce_gid(entry); if (ret) goto done; } store_gid_entry(table, entry); return 0; done: put_gid_entry(entry); return ret; } /* rwlock should be read locked, or lock should be held */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask, int *pempty) { int i = 0; int found = -1; int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { struct ib_gid_table_entry *data = table->data_vec[i]; struct ib_gid_attr *attr; int curr_index = i; i++; /* find_gid() is used during GID addition where it is expected * to return a free entry slot which is not duplicate. * Free entry slot is requested and returned if pempty is set, * so lookup free slot only if requested. */ if (pempty && empty < 0) { if (is_gid_entry_free(data) && default_gid == is_gid_index_default(table, curr_index)) { /* * Found an invalid (free) entry; allocate it. * If default GID is requested, then our * found slot must be one of the DEFAULT * reserved slots or we fail. * This ensures that only DEFAULT reserved * slots are used for default property GIDs. */ empty = curr_index; } } /* * Additionally find_gid() is used to find valid entry during * lookup operation; so ignore the entries which are marked as * pending for removal and the entries which are marked as * invalid. */ if (!is_gid_entry_valid(data)) continue; if (found >= 0) continue; attr = &data->attr; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &data->attr.gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && is_gid_index_default(table, curr_index) != default_gid) continue; found = curr_index; } if (pempty) *pempty = empty; return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); addrconf_ifid_eui48(&gid->raw[8], dev); } static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int empty; int ix; /* Do not allow adding zero GID in support of * IB spec version 1.3 section 4.1.1 point (6) and * section 12.7.10 and section 12.7.20 */ if (rdma_is_zero_gid(gid)) return -EINVAL; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, &empty); if (ix >= 0) goto out_unlock; if (empty < 0) { ret = -ENOSPC; goto out_unlock; } attr->device = ib_dev; attr->index = empty; attr->port_num = port; attr->gid = *gid; ret = add_modify_gid(table, attr); if (!ret) dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV; return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); } static int _ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int ix; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, NULL); if (ix < 0) { ret = -EINVAL; goto out_unlock; } del_gid(ib_dev, port, table, ix); dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_debug("%s: can't delete gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); } int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct ib_gid_table *table; int ix; bool deleted = false; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); for (ix = 0; ix < table->sz; ix++) { if (is_gid_entry_valid(table->data_vec[ix]) && table->data_vec[ix]->attr.ndev == ndev) { del_gid(ib_dev, port, table, ix); deleted = true; } } mutex_unlock(&table->lock); if (deleted) dispatch_gid_change_event(ib_dev, port); return 0; } /** * rdma_find_gid_by_port - Returns the GID entry attributes when it finds * a valid GID entry for given search parameters. It searches for the specified * GID value in the local software cache. * @ib_dev: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @port: The port number of the device where the GID value should be searched. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * Returns sgid attributes if the GID is found with valid reference or * returns ERR_PTR for the error. * The caller must invoke rdma_put_gid_attr() to release the reference. */ const struct ib_gid_attr * rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, u32 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; const struct ib_gid_attr *attr; unsigned long flags; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-ENOENT); table = rdma_gid_table(ib_dev, port); if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { get_gid_entry(table->data_vec[local_index]); attr = &table->data_vec[local_index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid_by_port); /** * rdma_find_gid_by_filter - Returns the GID table attribute where a * specified GID value occurs * @ib_dev: The device to query. * @gid: The GID value to search for. * @port: The port number of the device where the GID value could be * searched. * @filter: The filter function is executed on any matching GID in the table. * If the filter function returns true, the corresponding index is returned, * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. * @context: Private data to pass into the call-back. * * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. * */ const struct ib_gid_attr *rdma_find_gid_by_filter( struct ib_device *ib_dev, const union ib_gid *gid, u32 port, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context) { const struct ib_gid_attr *res = ERR_PTR(-ENOENT); struct ib_gid_table *table; unsigned long flags; unsigned int i; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-EINVAL); table = rdma_gid_table(ib_dev, port); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_table_entry *entry = table->data_vec[i]; if (!is_gid_entry_valid(entry)) continue; if (memcmp(gid, &entry->attr.gid, sizeof(*gid))) continue; if (filter(gid, &entry->attr, context)) { get_gid_entry(entry); res = &entry->attr; break; } } read_unlock_irqrestore(&table->rwlock, flags); return res; } static struct ib_gid_table *alloc_gid_table(int sz) { struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); if (!table->data_vec) goto err_free_table; mutex_init(&table->lock); table->sz = sz; rwlock_init(&table->rwlock); return table; err_free_table: kfree(table); return NULL; } static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { int i; if (!table) return; for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; WARN_ONCE(true, "GID entry ref leak for dev %s index %d ref=%u\n", dev_name(&device->dev), i, kref_read(&table->data_vec[i]->kref)); } mutex_destroy(&table->lock); kfree(table->data_vec); kfree(table); } static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { int i; if (!table) return; mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { if (is_gid_entry_valid(table->data_vec[i])) del_gid(ib_dev, port, table, i); } mutex_unlock(&table->lock); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { union ib_gid gid = { }; struct ib_gid_attr gid_attr; unsigned int gid_type; unsigned long mask; mask = GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { if (1UL << gid_type & ~gid_type_mask) continue; gid_attr.gid_type = gid_type; if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { make_default_gid(ndev, &gid); __ib_cache_gid_add(ib_dev, port, &gid, &gid_attr, mask, true); } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { _ib_cache_gid_del(ib_dev, port, &gid, &gid_attr, mask, true); } } } static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { unsigned int i; unsigned long roce_gid_type_mask; unsigned int num_default_gids; roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); /* Reserve starting indices for default GIDs */ for (i = 0; i < num_default_gids && i < table->sz; i++) table->default_gid_indices |= BIT(i); } static void gid_table_release_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) { release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { struct ib_gid_table *table; u32 rdma_port; rdma_for_each_port (ib_dev, rdma_port) { table = alloc_gid_table( ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); ib_dev->port_data[rdma_port].cache.gid = table; } return 0; rollback_table_setup: gid_table_release_one(ib_dev); return -ENOMEM; } static void gid_table_cleanup_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) cleanup_gid_table_port(ib_dev, p, ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) { int err; err = _gid_table_setup_one(ib_dev); if (err) return err; rdma_roce_rescan_device(ib_dev); return err; } /** * rdma_query_gid - Read the GID content from the GID software cache * @device: Device to query the GID * @port_num: Port number of the device * @index: Index of the GID table entry to read * @gid: Pointer to GID where to store the entry's GID * * rdma_query_gid() only reads the GID entry content for requested device, * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't * hold any reference to the GID table entry in the HCA or software cache. * * Returns 0 on success or appropriate error code. * */ int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid) { struct ib_gid_table *table; unsigned long flags; int res; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); if (index < 0 || index >= table->sz) { res = -EINVAL; goto done; } if (!is_gid_entry_valid(table->data_vec[index])) { res = -ENOENT; goto done; } memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); res = 0; done: read_unlock_irqrestore(&table->rwlock, flags); return res; } EXPORT_SYMBOL(rdma_query_gid); /** * rdma_read_gid_hw_context - Read the HW GID context from GID attribute * @attr: Potinter to the GID attribute * * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding * to the SGID attr. Callers are required to already be holding the reference * to an existing GID entry. * * Returns the HW GID context * */ void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) { return container_of(attr, struct ib_gid_table_entry, attr)->context; } EXPORT_SYMBOL(rdma_read_gid_hw_context); /** * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * rdma_find_gid() searches for the specified GID value in the software cache. * * Returns GID attributes if a valid GID is found or returns ERR_PTR for the * error. The caller must invoke rdma_put_gid_attr() to release the reference. * */ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, struct net_device *ndev) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; u32 p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { const struct ib_gid_attr *attr; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, u32 port_num, int index, u16 *pkey) { struct ib_pkey_cache *cache; unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, u64 *sn_pfx) { unsigned long flags; read_lock_irqsave(&device->cache_lock, flags); *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache_lock, flags); } EXPORT_SYMBOL(ib_get_cached_subnet_prefix); int ib_find_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; int partial_ix = -1; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { if (cache->table[i] & 0x8000) { *index = i; ret = 0; break; } else { partial_ix = i; } } if (ret && partial_ix >= 0) { *index = partial_ix; ret = 0; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if (cache->table[i] == pkey) { *index = i; ret = 0; break; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); int ib_get_cached_port_state(struct ib_device *device, u32 port_num, enum ib_port_state *port_state) { unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_port_state); /** * rdma_get_gid_attr - Returns GID attributes for a port of a device * at a requested gid_index, if a valid GID entry exists. * @device: The device to query. * @port_num: The port number on the device where the GID value * is to be queried. * @index: Index of the GID table entry whose attributes are to * be queried. * * rdma_get_gid_attr() acquires reference count of gid attributes from the * cached GID table. Caller must invoke rdma_put_gid_attr() to release * reference to gid attribute regardless of link layer. * * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error * code. */ const struct ib_gid_attr * rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) { const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; if (!rdma_is_port_valid(device, port_num)) return ERR_PTR(-EINVAL); table = rdma_gid_table(device, port_num); if (index < 0 || index >= table->sz) return ERR_PTR(-EINVAL); read_lock_irqsave(&table->rwlock, flags); if (!is_gid_entry_valid(table->data_vec[index])) goto done; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; done: read_unlock_irqrestore(&table->rwlock, flags); return attr; } EXPORT_SYMBOL(rdma_get_gid_attr); /** * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. * @device: The device to query. * @entries: Entries where GID entries are returned. * @max_entries: Maximum number of entries that can be returned. * Entries array must be allocated to hold max_entries number of entries. * * Returns number of entries on success or appropriate error code. */ ssize_t rdma_query_gid_table(struct ib_device *device, struct ib_uverbs_gid_entry *entries, size_t max_entries) { const struct ib_gid_attr *gid_attr; ssize_t num_entries = 0, ret; struct ib_gid_table *table; u32 port_num, i; struct net_device *ndev; unsigned long flags; rdma_for_each_port(device, port_num) { table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { if (!is_gid_entry_valid(table->data_vec[i])) continue; if (num_entries >= max_entries) { ret = -EINVAL; goto err; } gid_attr = &table->data_vec[i]->attr; memcpy(&entries->gid, &gid_attr->gid, sizeof(gid_attr->gid)); entries->gid_index = gid_attr->index; entries->port_num = gid_attr->port_num; entries->gid_type = gid_attr->gid_type; ndev = rcu_dereference_protected( gid_attr->ndev, lockdep_is_held(&table->rwlock)); if (ndev) entries->netdev_ifindex = ndev->ifindex; num_entries++; entries++; } read_unlock_irqrestore(&table->rwlock, flags); } return num_entries; err: read_unlock_irqrestore(&table->rwlock, flags); return ret; } EXPORT_SYMBOL(rdma_query_gid_table); /** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. * * rdma_put_gid_attr() must be used to release reference whose * reference is acquired using rdma_get_gid_attr() or any APIs * which returns a pointer to the ib_gid_attr regardless of link layer * of IB or RoCE. * */ void rdma_put_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); put_gid_entry(entry); } EXPORT_SYMBOL(rdma_put_gid_attr); /** * rdma_hold_gid_attr - Get reference to existing GID attribute * * @attr: Pointer to the GID attribute whose reference * needs to be taken. * * Increase the reference count to a GID attribute to keep it from being * freed. Callers are required to already be holding a reference to attribute. * */ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); get_gid_entry(entry); } EXPORT_SYMBOL(rdma_hold_gid_attr); /** * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice * which must be in UP state. * * @attr:Pointer to the GID attribute * * Returns pointer to netdevice if the netdevice was attached to GID and * netdevice is in UP state. Caller must hold RCU lock as this API * reads the netdev flags which can change while netdevice migrates to * different net namespace. Returns ERR_PTR with error code otherwise. * */ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; struct net_device *ndev = ERR_PTR(-EINVAL); u32 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; bool valid; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); valid = is_gid_entry_valid(table->data_vec[attr->index]); if (valid) { ndev = rcu_dereference(attr->ndev); if (!ndev) ndev = ERR_PTR(-ENODEV); } read_unlock_irqrestore(&table->rwlock, flags); return ndev; } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); static int get_lower_dev_vlan(struct net_device *lower_dev, struct netdev_nested_priv *priv) { u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); /* We are interested only in first level vlan device, so * always return 1 to stop iterating over next level devices. */ return 1; } /** * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address * of a GID entry. * * @attr: GID attribute pointer whose L2 fields to be read * @vlan_id: Pointer to vlan id to fill up if the GID entry has * vlan id. It is optional. * @smac: Pointer to smac to fill up for a GID entry. It is optional. * * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id * (if gid entry has vlan) and source MAC, or returns error. */ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { struct netdev_nested_priv priv = { .data = (void *)vlan_id, }; struct net_device *ndev; rcu_read_lock(); ndev = rcu_dereference(attr->ndev); if (!ndev) { rcu_read_unlock(); return -ENODEV; } if (smac) ether_addr_copy(smac, ndev->dev_addr); if (vlan_id) { *vlan_id = 0xffff; if (is_vlan_dev(ndev)) { *vlan_id = vlan_dev_vlan_id(ndev); } else { /* If the netdev is upper device and if it's lower * device is vlan device, consider vlan id of * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, get_lower_dev_vlan, &priv); } } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, u32 port, struct ib_port_attr *tprops) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; int ret = 0; int i; gid_attr.device = device; gid_attr.port_num = port; table = rdma_gid_table(device, port); mutex_lock(&table->lock); for (i = 0; i < tprops->gid_tbl_len; ++i) { if (!device->ops.query_gid) continue; ret = device->ops.query_gid(device, port, i, &gid_attr.gid); if (ret) { dev_warn(&device->dev, "query_gid failed (%d) for index %d\n", ret, i); goto err; } if (rdma_protocol_iwarp(device, port)) { struct net_device *ndev; ndev = ib_device_get_netdev(device, port); if (!ndev) continue; RCU_INIT_POINTER(gid_attr.ndev, ndev); dev_put(ndev); } gid_attr.index = i; tprops->subnet_prefix = be64_to_cpu(gid_attr.gid.global.subnet_prefix); add_modify_gid(table, &gid_attr); } err: mutex_unlock(&table->lock); return ret; } static int ib_cache_update(struct ib_device *device, u32 port, bool update_gids, bool update_pkeys, bool enforce_security) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL; struct ib_pkey_cache *old_pkey_cache = NULL; int i; int ret; if (!rdma_is_port_valid(device, port)) return -EINVAL; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return -ENOMEM; ret = ib_query_port(device, port, tprops); if (ret) { dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } if (!rdma_protocol_roce(device, port) && update_gids) { ret = config_non_roce_gid_cache(device, port, tprops); if (ret) goto err; } update_pkeys &= !!tprops->pkey_tbl_len; if (update_pkeys) { pkey_cache = kmalloc(struct_size(pkey_cache, table, tprops->pkey_tbl_len), GFP_KERNEL); if (!pkey_cache) { ret = -ENOMEM; goto err; } pkey_cache->table_len = tprops->pkey_tbl_len; for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { dev_warn(&device->dev, "ib_query_pkey failed (%d) for index %d\n", ret, i); goto err; } } } write_lock_irq(&device->cache_lock); if (update_pkeys) { old_pkey_cache = device->port_data[port].cache.pkey; device->port_data[port].cache.pkey = pkey_cache; } device->port_data[port].cache.lmc = tprops->lmc; device->port_data[port].cache.port_state = tprops->state; device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache_lock); if (enforce_security) ib_security_cache_change(device, port, tprops->subnet_prefix); kfree(old_pkey_cache); kfree(tprops); return 0; err: kfree(pkey_cache); kfree(tprops); return ret; } static void ib_cache_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); int ret; /* Before distributing the cache update event, first sync * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, work->event.event == IB_EVENT_GID_CHANGE, work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); /* GID event is notified already for individual GID entries by * dispatch_gid_change_event(). Hence, notifiy for rest of the * events. */ if (!ret && work->event.event != IB_EVENT_GID_CHANGE) ib_dispatch_event_clients(&work->event); kfree(work); } static void ib_generic_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); ib_dispatch_event_clients(&work->event); kfree(work); } static bool is_cache_update_event(const struct ib_event *event) { return (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_GID_CHANGE); } /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(const struct ib_event *event) { struct ib_update_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; if (is_cache_update_event(event)) INIT_WORK(&work->work, ib_cache_event_task); else INIT_WORK(&work->work, ib_generic_event_task); work->event = *event; if (event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_GID_CHANGE) work->enforce_security = true; queue_work(ib_wq, &work->work); } EXPORT_SYMBOL(ib_dispatch_event); int ib_cache_setup_one(struct ib_device *device) { u32 p; int err; err = gid_table_setup_one(device); if (err) return err; rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); if (err) return err; } return 0; } void ib_cache_release_one(struct ib_device *device) { u32 p; /* * The release function frees all the cache elements. * This function should be called as part of freeing * all the device's resources when the cache could no * longer be accessed. */ rdma_for_each_port (device, p) kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); } void ib_cache_cleanup_one(struct ib_device *device) { /* The cleanup function waits for all in-progress workqueue * elements and cleans up the GID cache. This function should be * called after the device was removed from the devices list and * all clients were removed, so the cache exists but is * non-functional and shouldn't be updated anymore. */ flush_workqueue(ib_wq); gid_table_cleanup_one(device); /* * Flush the wq second time for any pending GID delete work. */ flush_workqueue(ib_wq); }
237 201 208 247 139 238 238 7 204 184 185 185 185 185 185 185 3 3 3 3 380 380 3 3 3 7 7 7 7 7 15 15 15 15 15 1 1 1 1 1 139 139 139 139 139 139 3 139 139 1 1 1 1 1 106 106 106 106 106 102 102 102 102 102 11 11 11 11 11 11 5 2 2 2 2 2 10 10 10 10 10 209 5 1 1 5 2 1 1 2 1 1 1 2 1 139 139 20 3 3 20 3 3 206 204 20 206 230 201 11 201 231 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * These functions manipulate an sctp event. The struct ulpevent is used * to carry notifications and data to the ULP (sockets). * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Ardelle Fan <ardelle.fan@intel.com> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/skbuff.h> #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ static void sctp_ulpevent_init(struct sctp_ulpevent *event, __u16 msg_flags, unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); event->msg_flags = msg_flags; event->rmem_len = len; } /* Create a new sctp_ulpevent. */ static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff *skb; skb = alloc_skb(size, gfp); if (!skb) goto fail; event = sctp_skb2event(skb); sctp_ulpevent_init(event, msg_flags, skb->truesize); return event; fail: return NULL; } /* Is this a MSG_NOTIFICATION? */ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) { return MSG_NOTIFICATION == (event->msg_flags & MSG_NOTIFICATION); } /* Hold the association in case the msg_name needs read out of * the association. */ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, const struct sctp_association *asoc) { struct sctp_chunk *chunk = event->chunk; struct sk_buff *skb; /* Cast away the const, as we are just wanting to * bump the reference count. */ sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); event->asoc = (struct sctp_association *)asoc; atomic_add(event->rmem_len, &event->asoc->rmem_alloc); sctp_skb_set_owner_r(skb, asoc->base.sk); if (chunk && chunk->head_skb && !chunk->head_skb->sk) chunk->head_skb->sk = asoc->base.sk; } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { struct sctp_association *asoc = event->asoc; atomic_sub(event->rmem_len, &asoc->rmem_alloc); sctp_association_put(asoc); } /* Create and initialize an SCTP_ASSOC_CHANGE event. * * 5.3.1.1 SCTP_ASSOC_CHANGE * * Communication notifications inform the ULP that an SCTP association * has either begun or ended. The identifier for a new association is * provided by this notification. * * Note: There is no field checking here. If a field is unused it will be * zero'd out. */ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( const struct sctp_association *asoc, __u16 flags, __u16 state, __u16 error, __u16 outbound, __u16 inbound, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_assoc_change *sac; struct sk_buff *skb; /* If the lower layer passed in the chunk, it will be * an ABORT, so we need to include it in the sac_info. */ if (chunk) { /* Copy the chunk data to a new skb and reserve enough * head room to use as notification. */ skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_assoc_change), 0, gfp); if (!skb) goto fail; /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); /* Include the notification structure */ sac = skb_push(skb, sizeof(struct sctp_assoc_change)); /* Trim the buffer to the right length. */ skb_trim(skb, sizeof(struct sctp_assoc_change) + ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_chunkhdr)); } else { event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); sac = skb_put(skb, sizeof(struct sctp_assoc_change)); } /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_type: * It should be SCTP_ASSOC_CHANGE. */ sac->sac_type = SCTP_ASSOC_CHANGE; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_state: 32 bits (signed integer) * This field holds one of a number of values that communicate the * event that happened to the association. */ sac->sac_state = state; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_flags: 16 bits (unsigned integer) * Currently unused. */ sac->sac_flags = 0; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_length: sizeof (__u32) * This field is the total length of the notification data, including * the notification header. */ sac->sac_length = skb->len; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_error: 32 bits (signed integer) * * If the state was reached due to a error condition (e.g. * COMMUNICATION_LOST) any relevant error information is available in * this field. This corresponds to the protocol error codes defined in * [SCTP]. */ sac->sac_error = error; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_outbound_streams: 16 bits (unsigned integer) * sac_inbound_streams: 16 bits (unsigned integer) * * The maximum number of streams allowed in each direction are * available in sac_outbound_streams and sac_inbound streams. */ sac->sac_outbound_streams = outbound; sac->sac_inbound_streams = inbound; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_assoc_id: sizeof (sctp_assoc_t) * * The association id field, holds the identifier for the association. * All notifications for a given association have the same association * identifier. For TCP style socket, this field is ignored. */ sctp_ulpevent_set_owner(event, asoc); sac->sac_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* Create and initialize an SCTP_PEER_ADDR_CHANGE event. * * Socket Extensions for SCTP - draft-01 * 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * When a destination address on a multi-homed peer encounters a change * an interface details event is sent. */ static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, int flags, int state, int error, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_paddr_change *spc; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_paddr_change), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); spc = skb_put(skb, sizeof(struct sctp_paddr_change)); /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_type: * * It should be SCTP_PEER_ADDR_CHANGE. */ spc->spc_type = SCTP_PEER_ADDR_CHANGE; /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_length: sizeof (__u32) * * This field is the total length of the notification data, including * the notification header. */ spc->spc_length = sizeof(struct sctp_paddr_change); /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_flags: 16 bits (unsigned integer) * Currently unused. */ spc->spc_flags = 0; /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_state: 32 bits (signed integer) * * This field holds one of a number of values that communicate the * event that happened to the address. */ spc->spc_state = state; /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_error: 32 bits (signed integer) * * If the state was reached due to any error condition (e.g. * ADDRESS_UNREACHABLE) any relevant error information is available in * this field. */ spc->spc_error = error; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * spc_assoc_id: sizeof (sctp_assoc_t) * * The association id field, holds the identifier for the association. * All notifications for a given association have the same association * identifier. For TCP style socket, this field is ignored. */ sctp_ulpevent_set_owner(event, asoc); spc->spc_assoc_id = sctp_assoc2id(asoc); /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_aaddr: sizeof (struct sockaddr_storage) * * The affected address field, holds the remote peer's address that is * encountering the change of state. */ memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage)); /* Map ipv4 address into v4-mapped-on-v6 address. */ sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_to_user( sctp_sk(asoc->base.sk), (union sctp_addr *)&spc->spc_aaddr); return event; fail: return NULL; } void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error) { struct sctp_association *asoc = transport->asoc; struct sockaddr_storage addr; struct sctp_ulpevent *event; if (asoc->state < SCTP_STATE_ESTABLISHED) return; memset(&addr, 0, sizeof(struct sockaddr_storage)); memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state, error, GFP_ATOMIC); if (event) asoc->stream.si->enqueue_event(&asoc->ulpq, event); } /* Create and initialize an SCTP_REMOTE_ERROR notification. * * Note: This assumes that the chunk->skb->data already points to the * operation error payload. * * Socket Extensions for SCTP - draft-01 * 5.3.1.3 SCTP_REMOTE_ERROR * * A remote peer may send an Operational Error message to its peer. * This message indicates a variety of error conditions on an * association. The entire error TLV as it appears on the wire is * included in a SCTP_REMOTE_ERROR event. Please refer to the SCTP * specification [SCTP] and any extensions for a list of possible * error formats. */ struct sctp_ulpevent * sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, gfp_t gfp) { struct sctp_remote_error *sre; struct sctp_ulpevent *event; struct sctp_errhdr *ch; struct sk_buff *skb; __be16 cause; int elen; ch = (struct sctp_errhdr *)(chunk->skb->data); cause = ch->cause; elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(*ch); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(*ch)); /* Copy the skb to a new skb with room for us to prepend * notification with. */ skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp); /* Pull off the rest of the cause TLV from the chunk. */ skb_pull(chunk->skb, elen); if (!skb) goto fail; /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); sre = skb_push(skb, sizeof(*sre)); /* Trim the buffer to the right length. */ skb_trim(skb, sizeof(*sre) + elen); /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */ memset(sre, 0, sizeof(*sre)); sre->sre_type = SCTP_REMOTE_ERROR; sre->sre_flags = 0; sre->sre_length = skb->len; sre->sre_error = cause; sctp_ulpevent_set_owner(event, asoc); sre->sre_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* Create and initialize a SCTP_SEND_FAILED notification. * * Socket Extensions for SCTP - draft-01 * 5.3.1.4 SCTP_SEND_FAILED */ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, __u32 error, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_send_failed *ssf; struct sk_buff *skb; /* Pull off any padding. */ int len = ntohs(chunk->chunk_hdr->length); /* Make skb with more room so we can prepend notification. */ skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_send_failed), /* headroom */ 0, /* tailroom */ gfp); if (!skb) goto fail; /* Pull off the common chunk header and DATA header. */ skb_pull(skb, sctp_datachk_len(&asoc->stream)); len -= sctp_datachk_len(&asoc->stream); /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); ssf = skb_push(skb, sizeof(struct sctp_send_failed)); /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_type: * It should be SCTP_SEND_FAILED. */ ssf->ssf_type = SCTP_SEND_FAILED; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_flags: 16 bits (unsigned integer) * The flag value will take one of the following values * * SCTP_DATA_UNSENT - Indicates that the data was never put on * the wire. * * SCTP_DATA_SENT - Indicates that the data was put on the wire. * Note that this does not necessarily mean that the * data was (or was not) successfully delivered. */ ssf->ssf_flags = flags; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_length: sizeof (__u32) * This field is the total length of the notification data, including * the notification header. */ ssf->ssf_length = sizeof(struct sctp_send_failed) + len; skb_trim(skb, ssf->ssf_length); /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_error: 16 bits (unsigned integer) * This value represents the reason why the send failed, and if set, * will be a SCTP protocol error code as defined in [SCTP] section * 3.3.10. */ ssf->ssf_error = error; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_info: sizeof (struct sctp_sndrcvinfo) * The original send information associated with the undelivered * message. */ memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo)); /* Per TSVWG discussion with Randy. Allow the application to * reassemble a fragmented message. */ ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_assoc_id: sizeof (sctp_assoc_t) * The association id field, sf_assoc_id, holds the identifier for the * association. All notifications for a given association have the * same association identifier. For TCP style socket, this field is * ignored. */ sctp_ulpevent_set_owner(event, asoc); ssf->ssf_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, __u32 error, gfp_t gfp) { struct sctp_send_failed_event *ssf; struct sctp_ulpevent *event; struct sk_buff *skb; int len; skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp); if (!skb) return NULL; len = ntohs(chunk->chunk_hdr->length); len -= sctp_datachk_len(&asoc->stream); skb_pull(skb, sctp_datachk_len(&asoc->stream)); event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); ssf = skb_push(skb, sizeof(*ssf)); ssf->ssf_type = SCTP_SEND_FAILED_EVENT; ssf->ssf_flags = flags; ssf->ssf_length = sizeof(*ssf) + len; skb_trim(skb, ssf->ssf_length); ssf->ssf_error = error; ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream; ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid; ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context; ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id; ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags; sctp_ulpevent_set_owner(event, asoc); ssf->ssf_assoc_id = sctp_assoc2id(asoc); return event; } /* Create and initialize a SCTP_SHUTDOWN_EVENT notification. * * Socket Extensions for SCTP - draft-01 * 5.3.1.5 SCTP_SHUTDOWN_EVENT */ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( const struct sctp_association *asoc, __u16 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_shutdown_event *sse; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_shutdown_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); sse = skb_put(skb, sizeof(struct sctp_shutdown_event)); /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_type * It should be SCTP_SHUTDOWN_EVENT */ sse->sse_type = SCTP_SHUTDOWN_EVENT; /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_flags: 16 bits (unsigned integer) * Currently unused. */ sse->sse_flags = 0; /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_length: sizeof (__u32) * This field is the total length of the notification data, including * the notification header. */ sse->sse_length = sizeof(struct sctp_shutdown_event); /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_assoc_id: sizeof (sctp_assoc_t) * The association id field, holds the identifier for the association. * All notifications for a given association have the same association * identifier. For TCP style socket, this field is ignored. */ sctp_ulpevent_set_owner(event, asoc); sse->sse_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* Create and initialize a SCTP_ADAPTATION_INDICATION notification. * * Socket Extensions for SCTP * 5.3.1.6 SCTP_ADAPTATION_INDICATION */ struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication( const struct sctp_association *asoc, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_adaptation_event *sai; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_adaptation_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); sai = skb_put(skb, sizeof(struct sctp_adaptation_event)); sai->sai_type = SCTP_ADAPTATION_INDICATION; sai->sai_flags = 0; sai->sai_length = sizeof(struct sctp_adaptation_event); sai->sai_adaptation_ind = asoc->peer.adaptation_ind; sctp_ulpevent_set_owner(event, asoc); sai->sai_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* A message has been received. Package this message as a notification * to pass it to the upper layers. Go ahead and calculate the sndrcvinfo * even if filtered out later. * * Socket Extensions for SCTP * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) */ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event = NULL; struct sk_buff *skb = chunk->skb; struct sock *sk = asoc->base.sk; size_t padding, datalen; int rx_count; /* * check to see if we need to make space for this * new skb, expand the rcvbuffer if needed, or drop * the frame */ if (asoc->ep->rcvbuf_policy) rx_count = atomic_read(&asoc->rmem_alloc); else rx_count = atomic_read(&sk->sk_rmem_alloc); datalen = ntohs(chunk->chunk_hdr->length); if (rx_count >= sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, datalen)) goto fail; /* Clone the original skb, sharing the data. */ skb = skb_clone(chunk->skb, gfp); if (!skb) goto fail; /* Now that all memory allocations for this chunk succeeded, we * can mark it as received so the tsn_map is updated correctly. */ if (sctp_tsnmap_mark(&asoc->peer.tsn_map, ntohl(chunk->subh.data_hdr->tsn), chunk->transport)) goto fail_mark; /* First calculate the padding, so we don't inadvertently * pass up the wrong length to the user. * * RFC 2960 - Section 3.2 Chunk Field Descriptions * * The total length of a chunk(including Type, Length and Value fields) * MUST be a multiple of 4 bytes. If the length of the chunk is not a * multiple of 4 bytes, the sender MUST pad the chunk with all zero * bytes and this padding is not included in the chunk length field. * The sender should never pad with more than 3 bytes. The receiver * MUST ignore the padding bytes. */ padding = SCTP_PAD4(datalen) - datalen; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); /* Initialize event with flags 0 and correct length * Since this is a clone of the original skb, only account for * the data of this chunk as other chunks will be accounted separately. */ sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff)); /* And hold the chunk as we need it for getting the IP headers * later in recvmsg */ sctp_chunk_hold(chunk); event->chunk = chunk; sctp_ulpevent_receive_data(event, asoc); event->stream = ntohs(chunk->subh.data_hdr->stream); if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { event->flags |= SCTP_UNORDERED; event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); } event->tsn = ntohl(chunk->subh.data_hdr->tsn); event->msg_flags |= chunk->chunk_hdr->flags; return event; fail_mark: kfree_skb(skb); fail: return NULL; } /* Create a partial delivery related event. * * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT * * When a receiver is engaged in a partial delivery of a * message this notification will be used to indicate * various events. */ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( const struct sctp_association *asoc, __u32 indication, __u32 sid, __u32 seq, __u32 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_pdapi_event *pd; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_pdapi_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); pd = skb_put(skb, sizeof(struct sctp_pdapi_event)); /* pdapi_type * It should be SCTP_PARTIAL_DELIVERY_EVENT * * pdapi_flags: 16 bits (unsigned integer) * Currently unused. */ pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; pd->pdapi_flags = flags; pd->pdapi_stream = sid; pd->pdapi_seq = seq; /* pdapi_length: 32 bits (unsigned integer) * * This field is the total length of the notification data, including * the notification header. It will generally be sizeof (struct * sctp_pdapi_event). */ pd->pdapi_length = sizeof(struct sctp_pdapi_event); /* pdapi_indication: 32 bits (unsigned integer) * * This field holds the indication being sent to the application. */ pd->pdapi_indication = indication; /* pdapi_assoc_id: sizeof (sctp_assoc_t) * * The association id field, holds the identifier for the association. */ sctp_ulpevent_set_owner(event, asoc); pd->pdapi_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } struct sctp_ulpevent *sctp_ulpevent_make_authkey( const struct sctp_association *asoc, __u16 key_id, __u32 indication, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_authkey_event *ak; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_authkey_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); ak = skb_put(skb, sizeof(struct sctp_authkey_event)); ak->auth_type = SCTP_AUTHENTICATION_EVENT; ak->auth_flags = 0; ak->auth_length = sizeof(struct sctp_authkey_event); ak->auth_keynumber = key_id; ak->auth_altkeynumber = 0; ak->auth_indication = indication; /* * The association id field, holds the identifier for the association. */ sctp_ulpevent_set_owner(event, asoc); ak->auth_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* * Socket Extensions for SCTP * 6.3.10. SCTP_SENDER_DRY_EVENT */ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( const struct sctp_association *asoc, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_sender_dry_event *sdry; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_sender_dry_event), MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); sdry = skb_put(skb, sizeof(struct sctp_sender_dry_event)); sdry->sender_dry_type = SCTP_SENDER_DRY_EVENT; sdry->sender_dry_flags = 0; sdry->sender_dry_length = sizeof(struct sctp_sender_dry_event); sctp_ulpevent_set_owner(event, asoc); sdry->sender_dry_assoc_id = sctp_assoc2id(asoc); return event; } struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, __u16 stream_num, __be16 *stream_list, gfp_t gfp) { struct sctp_stream_reset_event *sreset; struct sctp_ulpevent *event; struct sk_buff *skb; int length, i; length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num; event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); sreset = skb_put(skb, length); sreset->strreset_type = SCTP_STREAM_RESET_EVENT; sreset->strreset_flags = flags; sreset->strreset_length = length; sctp_ulpevent_set_owner(event, asoc); sreset->strreset_assoc_id = sctp_assoc2id(asoc); for (i = 0; i < stream_num; i++) sreset->strreset_stream_list[i] = ntohs(stream_list[i]); return event; } struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( const struct sctp_association *asoc, __u16 flags, __u32 local_tsn, __u32 remote_tsn, gfp_t gfp) { struct sctp_assoc_reset_event *areset; struct sctp_ulpevent *event; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_assoc_reset_event), MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); areset = skb_put(skb, sizeof(struct sctp_assoc_reset_event)); areset->assocreset_type = SCTP_ASSOC_RESET_EVENT; areset->assocreset_flags = flags; areset->assocreset_length = sizeof(struct sctp_assoc_reset_event); sctp_ulpevent_set_owner(event, asoc); areset->assocreset_assoc_id = sctp_assoc2id(asoc); areset->assocreset_local_tsn = local_tsn; areset->assocreset_remote_tsn = remote_tsn; return event; } struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( const struct sctp_association *asoc, __u16 flags, __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp) { struct sctp_stream_change_event *schange; struct sctp_ulpevent *event; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_stream_change_event), MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); schange = skb_put(skb, sizeof(struct sctp_stream_change_event)); schange->strchange_type = SCTP_STREAM_CHANGE_EVENT; schange->strchange_flags = flags; schange->strchange_length = sizeof(struct sctp_stream_change_event); sctp_ulpevent_set_owner(event, asoc); schange->strchange_assoc_id = sctp_assoc2id(asoc); schange->strchange_instrms = strchange_instrms; schange->strchange_outstrms = strchange_outstrms; return event; } /* Return the notification type, assuming this is a notification * event. */ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) { union sctp_notification *notification; struct sk_buff *skb; skb = sctp_event2skb(event); notification = (union sctp_notification *) skb->data; return notification->sn_header.sn_type; } /* RFC6458, Section 5.3.2. SCTP Header Information Structure * (SCTP_SNDRCV, DEPRECATED) */ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { struct sctp_sndrcvinfo sinfo; if (sctp_ulpevent_is_notification(event)) return; memset(&sinfo, 0, sizeof(sinfo)); sinfo.sinfo_stream = event->stream; sinfo.sinfo_ssn = event->ssn; sinfo.sinfo_ppid = event->ppid; sinfo.sinfo_flags = event->flags; sinfo.sinfo_tsn = event->tsn; sinfo.sinfo_cumtsn = event->cumtsn; sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); /* Context value that is set via SCTP_CONTEXT socket option. */ sinfo.sinfo_context = event->asoc->default_rcv_context; /* These fields are not used while receiving. */ sinfo.sinfo_timetolive = 0; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, sizeof(sinfo), &sinfo); } /* RFC6458, Section 5.3.5 SCTP Receive Information Structure * (SCTP_SNDRCV) */ void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { struct sctp_rcvinfo rinfo; if (sctp_ulpevent_is_notification(event)) return; memset(&rinfo, 0, sizeof(struct sctp_rcvinfo)); rinfo.rcv_sid = event->stream; rinfo.rcv_ssn = event->ssn; rinfo.rcv_ppid = event->ppid; rinfo.rcv_flags = event->flags; rinfo.rcv_tsn = event->tsn; rinfo.rcv_cumtsn = event->cumtsn; rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc); rinfo.rcv_context = event->asoc->default_rcv_context; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO, sizeof(rinfo), &rinfo); } /* RFC6458, Section 5.3.6. SCTP Next Receive Information Structure * (SCTP_NXTINFO) */ static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr, const struct sk_buff *skb) { struct sctp_nxtinfo nxtinfo; memset(&nxtinfo, 0, sizeof(nxtinfo)); nxtinfo.nxt_sid = event->stream; nxtinfo.nxt_ppid = event->ppid; nxtinfo.nxt_flags = event->flags; if (sctp_ulpevent_is_notification(event)) nxtinfo.nxt_flags |= SCTP_NOTIFICATION; nxtinfo.nxt_length = skb->len; nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc); put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO, sizeof(nxtinfo), &nxtinfo); } void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr, struct sock *sk) { struct sk_buff *skb; int err; skb = sctp_skb_recv_datagram(sk, MSG_PEEK | MSG_DONTWAIT, &err); if (skb != NULL) { __sctp_ulpevent_read_nxtinfo(sctp_skb2event(skb), msghdr, skb); /* Just release refcount here. */ kfree_skb(skb); } } /* Do accounting for bytes received and hold a reference to the association * for each skb. */ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc) { struct sk_buff *skb, *frag; skb = sctp_event2skb(event); /* Set the owner and charge rwnd for bytes received. */ sctp_ulpevent_set_owner(event, asoc); sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); if (!skb->data_len) return; /* Note: Not clearing the entire event struct as this is just a * fragment of the real event. However, we still need to do rwnd * accounting. * In general, the skb passed from IP can have only 1 level of * fragments. But we allow multiple levels of fragments. */ skb_walk_frags(skb, frag) sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc); } /* Do accounting for bytes just read by user and release the references to * the association. */ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; unsigned int len; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as * multiple associations may be on a single UDP-style socket. * Use the local private area of the skb to track the owning * association. */ skb = sctp_event2skb(event); len = skb->len; if (!skb->data_len) goto done; /* Don't forget the fragments. */ skb_walk_frags(skb, frag) { /* NOTE: skb_shinfos are recursive. Although IP returns * skb's with only 1 level of fragments, SCTP reassembly can * increase the levels. */ sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); } done: sctp_assoc_rwnd_increase(event->asoc, len); sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; skb = sctp_event2skb(event); if (!skb->data_len) goto done; /* Don't forget the fragments. */ skb_walk_frags(skb, frag) { /* NOTE: skb_shinfos are recursive. Although IP returns * skb's with only 1 level of fragments, SCTP reassembly can * increase the levels. */ sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); } done: sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } /* Free a ulpevent that has an owner. It includes releasing the reference * to the owner, updating the rwnd in case of a DATA event and freeing the * skb. */ void sctp_ulpevent_free(struct sctp_ulpevent *event) { if (sctp_ulpevent_is_notification(event)) sctp_ulpevent_release_owner(event); else sctp_ulpevent_release_data(event); kfree_skb(sctp_event2skb(event)); } /* Purge the skb lists holding ulpevents. */ unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list) { struct sk_buff *skb; unsigned int data_unread = 0; while ((skb = skb_dequeue(list)) != NULL) { struct sctp_ulpevent *event = sctp_skb2event(skb); if (!sctp_ulpevent_is_notification(event)) data_unread += skb->len; sctp_ulpevent_free(event); } return data_unread; }
1 1 1 1 7 2 5 2 5 5 3 5 4 1 1 4 4 5 7 1 1 3 4 3 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 // SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/file.c * BFS file operations. * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com> * * Make the file block allocation algorithm understand the size * of the underlying block device. * Copyright (C) 2007 Dmitri Vorobiev <dmitri.vorobiev@gmail.com> * */ #include <linux/fs.h> #include <linux/mpage.h> #include <linux/buffer_head.h> #include "bfs.h" #undef DEBUG #ifdef DEBUG #define dprintf(x...) printf(x) #else #define dprintf(x...) #endif const struct file_operations bfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = filemap_splice_read, }; static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb) { struct buffer_head *bh, *new; bh = sb_bread(sb, from); if (!bh) return -EIO; new = sb_getblk(sb, to); memcpy(new->b_data, bh->b_data, bh->b_size); mark_buffer_dirty(new); bforget(bh); brelse(new); return 0; } static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned long end, unsigned long where) { unsigned long i; dprintf("%08lx-%08lx->%08lx\n", start, end, where); for (i = start; i <= end; i++) if(bfs_move_block(i, where + i, sb)) { dprintf("failed to move block %08lx -> %08lx\n", i, where + i); return -EIO; } return 0; } static int bfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { unsigned long phys; int err; struct super_block *sb = inode->i_sb; struct bfs_sb_info *info = BFS_SB(sb); struct bfs_inode_info *bi = BFS_I(inode); phys = bi->i_sblock + block; if (!create) { if (phys <= bi->i_eblock) { dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n", create, (unsigned long)block, phys); map_bh(bh_result, sb, phys); } return 0; } /* * If the file is not empty and the requested block is within the * range of blocks allocated for this file, we can grant it. */ if (bi->i_sblock && (phys <= bi->i_eblock)) { dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", create, (unsigned long)block, phys); map_bh(bh_result, sb, phys); return 0; } /* The file will be extended, so let's see if there is enough space. */ if (phys >= info->si_blocks) return -ENOSPC; /* The rest has to be protected against itself. */ mutex_lock(&info->bfs_lock); /* * If the last data block for this file is the last allocated * block, we can extend the file trivially, without moving it * anywhere. */ if (bi->i_eblock == info->si_lf_eblk) { dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", create, (unsigned long)block, phys); map_bh(bh_result, sb, phys); info->si_freeb -= phys - bi->i_eblock; info->si_lf_eblk = bi->i_eblock = phys; mark_inode_dirty(inode); err = 0; goto out; } /* Ok, we have to move this entire file to the next free block. */ phys = info->si_lf_eblk + 1; if (phys + block >= info->si_blocks) { err = -ENOSPC; goto out; } if (bi->i_sblock) { err = bfs_move_blocks(inode->i_sb, bi->i_sblock, bi->i_eblock, phys); if (err) { dprintf("failed to move ino=%08lx -> fs corruption\n", inode->i_ino); goto out; } } else err = 0; dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", create, (unsigned long)block, phys); bi->i_sblock = phys; phys += block; info->si_lf_eblk = bi->i_eblock = phys; /* * This assumes nothing can write the inode back while we are here * and thus update inode->i_blocks! (XXX) */ info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; mark_inode_dirty(inode); map_bh(bh_result, sb, phys); out: mutex_unlock(&info->bfs_lock); return err; } static int bfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, bfs_get_block); } static int bfs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, bfs_get_block); } static void bfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) truncate_pagecache(inode, inode->i_size); } static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); return ret; } static sector_t bfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, bfs_get_block); } const struct address_space_operations bfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = bfs_read_folio, .writepages = bfs_writepages, .write_begin = bfs_write_begin, .write_end = generic_write_end, .migrate_folio = buffer_migrate_folio, .bmap = bfs_bmap, }; const struct inode_operations bfs_file_inops;
177 177 132 131 132 132 131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 // SPDX-License-Identifier: GPL-2.0-only /* * lowlevel.c * * PURPOSE * Low Level Device Routines for the UDF filesystem * * COPYRIGHT * (C) 1999-2001 Ben Fennema * * HISTORY * * 03/26/99 blf Created. */ #include "udfdecl.h" #include <linux/blkdev.h> #include <linux/cdrom.h> #include <linux/uaccess.h> #include "udf_sb.h" unsigned int udf_get_last_session(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); struct cdrom_multisession ms_info; if (!cdi) { udf_debug("CDROMMULTISESSION not supported.\n"); return 0; } ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ return ms_info.addr.lba; } return 0; } udf_pblk_t udf_get_last_block(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; /* * The cdrom layer call failed or returned obviously bogus value? * Try using the device size... */ if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) { if (sb_bdev_nr_blocks(sb) > ~(udf_pblk_t)0) return 0; lblock = sb_bdev_nr_blocks(sb); } if (lblock) return lblock - 1; return 0; }
52 52 52 52 52 52 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 // SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * sysctl_net_core.c: sysctl interface to net core subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ #include <linux/filter.h> #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> #include <net/hotdata.h> #include <net/proto_memory.h> #include <net/rps.h> #include "dev.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; static int net_msg_warn; /* Unused, but still a sysctl */ int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); /* 0 - Keep current behavior: * IPv4: inherit all current settings from init_net * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, struct cpumask *mask) { char kbuf[128]; int len; if (*ppos || !*lenp) { *lenp = 0; return; } len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; return; } if (len < *lenp) kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } #endif #ifdef CONFIG_RPS static struct cpumask *rps_default_mask_cow_alloc(struct net *net) { struct cpumask *rps_default_mask; if (net->core.rps_default_mask) return net->core.rps_default_mask; rps_default_mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!rps_default_mask) return NULL; /* pairs with READ_ONCE in rx_queue_default_mask() */ WRITE_ONCE(net->core.rps_default_mask, rps_default_mask); return rps_default_mask; } static int rps_default_mask_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->data; int err = 0; rtnl_lock(); if (write) { struct cpumask *rps_default_mask = rps_default_mask_cow_alloc(net); err = -ENOMEM; if (!rps_default_mask) goto done; err = cpumask_parse(buffer, rps_default_mask); if (err) goto done; err = rps_cpumask_housekeeping(rps_default_mask); if (err) goto done; } else { dump_cpumask(buffer, lenp, ppos, net->core.rps_default_mask ? : cpu_none_mask); } done: rtnl_unlock(); return err; } static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode }; struct rps_sock_flow_table *orig_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); mutex_lock(&sock_flow_mutex); orig_sock_table = rcu_dereference_protected( net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (size) { if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; } size = roundup_pow_of_two(size); if (size != orig_size) { sock_table = vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); if (!sock_table) { mutex_unlock(&sock_flow_mutex); return -ENOMEM; } net_hotdata.rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; for (i = 0; i < size; i++) sock_table->ents[i] = RPS_NO_CPU; } else sock_table = NULL; if (sock_table != orig_sock_table) { rcu_assign_pointer(net_hotdata.rps_sock_flow_table, sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); } if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); kvfree_rcu_mightsleep(orig_sock_table); } } } mutex_unlock(&sock_flow_mutex); return ret; } #endif /* CONFIG_RPS */ #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; cpumask_var_t mask; int i, len, ret = 0; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; if (write) { ret = cpumask_parse(buffer, mask); if (ret) goto done; mutex_lock(&flow_limit_update_mutex); len = sizeof(*cur) + netdev_flow_limit_table_len; for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); cur = rcu_dereference_protected(sd->flow_limit, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); kfree_rcu_mightsleep(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; goto write_unlock; } cur->num_buckets = netdev_flow_limit_table_len; rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); if (rcu_dereference(sd->flow_limit)) cpumask_set_cpu(i, mask); } rcu_read_unlock(); dump_cpumask(buffer, lenp, ppos, mask); } done: free_cpumask_var(mask); return ret; } static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; mutex_lock(&flow_limit_update_mutex); ptr = table->data; old = *ptr; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write && !is_power_of_2(*ptr)) { *ptr = old; ret = -EINVAL; } mutex_unlock(&flow_limit_update_mutex); return ret; } #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED static int set_default_qdisc(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { .data = id, .maxlen = IFNAMSIZ, }; int ret; qdisc_get_default(id, IFNAMSIZ); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = qdisc_set_default(id); return ret; } #endif static int proc_do_dev_weight(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); } #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; int min = *(int *)table->extra1; int max = *(int *)table->extra2; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &jit_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); } else { ret = -EPERM; } } if (write && ret && min == max) pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); return ret; } # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } # endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif static struct ctl_table net_core_table[] = { { .procname = "mem_pcpu_rsv", .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_mem_pcpu_rsv, }, { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "dev_weight_rx_bias", .data = &dev_weight_rx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "dev_weight_tx_bias", .data = &dev_weight_tx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "netdev_max_backlog", .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "netdev_rss_key", .data = &netdev_rss_key, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_do_rss_key, }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, # else .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT { .procname = "bpf_jit_harden", .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, # endif { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = SYSCTL_LONG_ONE, .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { .procname = "flow_limit_cpu_bitmap", .mode = 0644, .proc_handler = flow_limit_cpu_sysctl }, { .procname = "flow_limit_table_len", .data = &netdev_flow_limit_table_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_RX_BUSY_POLL { .procname = "busy_poll", .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, #endif #ifdef CONFIG_NET_SCHED { .procname = "default_qdisc", .mode = 0644, .maxlen = IFNAMSIZ, .proc_handler = set_default_qdisc }, #endif { .procname = "netdev_budget", .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_skb_frags", .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &max_skb_frags, }, { .procname = "netdev_budget_usecs", .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "fb_tunnels_only_for_init_net", .data = &sysctl_fb_tunnels_only_for_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "devconf_inherit_init_net", .data = &sysctl_devconf_inherit_init_net, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "high_order_alloc_disable", .data = &net_high_order_alloc_disable_key.key, .maxlen = sizeof(net_high_order_alloc_disable_key), .mode = 0644, .proc_handler = proc_do_static_key, }, { .procname = "gro_normal_batch", .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, { .procname = "netdev_unregister_timeout_secs", .data = &netdev_unregister_timeout_secs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, { .procname = "skb_defer_max", .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, }; static struct ctl_table netns_core_table[] = { #if IS_ENABLED(CONFIG_RPS) { .procname = "rps_default_mask", .data = &init_net, .mode = 0644, .proc_handler = rps_default_mask_sysctl }, #endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { .procname = "optmem_max", .data = &init_net.core.sysctl_optmem_max, .maxlen = sizeof(int), .mode = 0644, .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), .mode = 0644, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ { .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) { /* fallback tunnels for initns only */ if (!strncmp(str, "initns", 6)) sysctl_fb_tunnels_only_for_init_net = 1; /* no fallback tunnels anywhere */ else if (!strncmp(str, "none", 4)) sysctl_fb_tunnels_only_for_init_net = 2; return 1; } __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(netns_core_table); struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; for (i = 0; i < table_size; ++i) { if (tbl[i].data == &sysctl_wmem_max) break; tbl[i].data += (char *)net - (char *)&init_net; } for (; i < table_size; ++i) tbl[i].mode &= ~0222; } net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; return 0; err_reg: if (tbl != netns_core_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_core_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); #if IS_ENABLED(CONFIG_RPS) kfree(net->core.rps_default_mask); #endif kfree(tbl); } static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, }; static __init int sysctl_core_init(void) { register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } fs_initcall(sysctl_core_init);
231 312 311 232 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_DEVICE_H #define _SCSI_SCSI_DEVICE_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> #include <linux/atomic.h> #include <linux/sbitmap.h> struct bsg_device; struct device; struct request_queue; struct scsi_cmnd; struct scsi_lun; struct scsi_sense_hdr; typedef __u64 __bitwise blist_flags_t; #define SCSI_SENSE_BUFFERSIZE 96 struct scsi_mode_data { __u32 length; __u16 block_descriptor_length; __u8 medium_type; __u8 device_specific; __u8 header_length; __u8 longlba:1; }; /* * sdev state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_lib:scsi_device_set_state(). */ enum scsi_device_state { SDEV_CREATED = 1, /* device created but not added to sysfs * Only internal commands allowed (for inq) */ SDEV_RUNNING, /* device properly configured * All commands allowed */ SDEV_CANCEL, /* beginning to delete device * Only error handler commands allowed */ SDEV_DEL, /* device deleted * no commands allowed */ SDEV_QUIESCE, /* Device quiescent. No block commands * will be accepted, only specials (which * originate in the mid-layer) */ SDEV_OFFLINE, /* Device offlined (by error handling or * user request */ SDEV_TRANSPORT_OFFLINE, /* Offlined by transport class error handler */ SDEV_BLOCK, /* Device blocked by scsi lld. No * scsi commands from user or midlayer * should be issued to the scsi * lld. */ SDEV_CREATED_BLOCK, /* same as above but for created devices */ }; enum scsi_scan_mode { SCSI_SCAN_INITIAL = 0, SCSI_SCAN_RESCAN, SCSI_SCAN_MANUAL, }; enum scsi_device_event { SDEV_EVT_MEDIA_CHANGE = 1, /* media has changed */ SDEV_EVT_INQUIRY_CHANGE_REPORTED, /* 3F 03 UA reported */ SDEV_EVT_CAPACITY_CHANGE_REPORTED, /* 2A 09 UA reported */ SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED, /* 38 07 UA reported */ SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED, /* 2A 01 UA reported */ SDEV_EVT_LUN_CHANGE_REPORTED, /* 3F 0E UA reported */ SDEV_EVT_ALUA_STATE_CHANGE_REPORTED, /* 2A 06 UA reported */ SDEV_EVT_POWER_ON_RESET_OCCURRED, /* 29 00 UA reported */ SDEV_EVT_FIRST = SDEV_EVT_MEDIA_CHANGE, SDEV_EVT_LAST = SDEV_EVT_POWER_ON_RESET_OCCURRED, SDEV_EVT_MAXBITS = SDEV_EVT_LAST + 1 }; struct scsi_event { enum scsi_device_event evt_type; struct list_head node; /* put union of data structures, for non-simple event types, * here */ }; /** * struct scsi_vpd - SCSI Vital Product Data * @rcu: For kfree_rcu(). * @len: Length in bytes of @data. * @data: VPD data as defined in various T10 SCSI standard documents. */ struct scsi_vpd { struct rcu_head rcu; int len; unsigned char data[]; }; struct scsi_device { struct Scsi_Host *host; struct request_queue *request_queue; /* the next two are protected by the host->host_lock */ struct list_head siblings; /* list of all devices on this host */ struct list_head same_target_siblings; /* just the devices sharing same target id */ struct sbitmap budget_map; atomic_t device_blocked; /* Device returned QUEUE_FULL. */ atomic_t restarts; spinlock_t list_lock; struct list_head starved_entry; unsigned short queue_depth; /* How deep of a queue we want */ unsigned short max_queue_depth; /* max queue depth */ unsigned short last_queue_full_depth; /* These two are used by */ unsigned short last_queue_full_count; /* scsi_track_queue_full() */ unsigned long last_queue_full_time; /* last queue full time */ unsigned long queue_ramp_up_period; /* ramp up period in jiffies */ #define SCSI_DEFAULT_RAMP_UP_PERIOD (120 * HZ) unsigned long last_queue_ramp_up; /* last queue ramp up time */ unsigned int id, channel; u64 lun; unsigned int manufacturer; /* Manufacturer of device, for using * vendor-specific cmd's */ unsigned sector_size; /* size in bytes */ void *hostdata; /* available to low-level driver */ unsigned char type; char scsi_level; char inq_periph_qual; /* PQ from INQUIRY data */ struct mutex inquiry_mutex; unsigned char inquiry_len; /* valid bytes in 'inquiry' */ unsigned char * inquiry; /* INQUIRY response data */ const char * vendor; /* [back_compat] point into 'inquiry' ... */ const char * model; /* ... after scan; point to static string */ const char * rev; /* ... "nullnullnullnull" before scan */ #define SCSI_DEFAULT_VPD_LEN 255 /* default SCSI VPD page size (max) */ struct scsi_vpd __rcu *vpd_pg0; struct scsi_vpd __rcu *vpd_pg83; struct scsi_vpd __rcu *vpd_pg80; struct scsi_vpd __rcu *vpd_pg89; struct scsi_vpd __rcu *vpd_pgb0; struct scsi_vpd __rcu *vpd_pgb1; struct scsi_vpd __rcu *vpd_pgb2; struct scsi_vpd __rcu *vpd_pgb7; struct scsi_target *sdev_target; blist_flags_t sdev_bflags; /* black/white flags as also found in * scsi_devinfo.[hc]. For now used only to * pass settings from slave_alloc to scsi * core. */ unsigned int eh_timeout; /* Error handling timeout */ /* * If true, let the high-level device driver (sd) manage the device * power state for system suspend/resume (suspend to RAM and * hibernation) operations. */ unsigned manage_system_start_stop:1; /* * If true, let the high-level device driver (sd) manage the device * power state for runtime device suspand and resume operations. */ unsigned manage_runtime_start_stop:1; /* * If true, let the high-level device driver (sd) manage the device * power state for system shutdown (power off) operations. */ unsigned manage_shutdown:1; /* * If set and if the device is runtime suspended, ask the high-level * device driver (sd) to force a runtime resume of the device. */ unsigned force_runtime_start_on_system_start:1; unsigned removable:1; unsigned changed:1; /* Data invalid due to media change */ unsigned busy:1; /* Used to prevent races */ unsigned lockable:1; /* Able to prevent media removal */ unsigned locked:1; /* Media removal disabled */ unsigned borken:1; /* Tell the Seagate driver to be * painfully slow on this device */ unsigned disconnect:1; /* can disconnect */ unsigned soft_reset:1; /* Uses soft reset option */ unsigned sdtr:1; /* Device supports SDTR messages */ unsigned wdtr:1; /* Device supports WDTR messages */ unsigned ppr:1; /* Device supports PPR messages */ unsigned tagged_supported:1; /* Supports SCSI-II tagged queuing */ unsigned simple_tags:1; /* simple queue tag messages are enabled */ unsigned was_reset:1; /* There was a bus reset on the bus for * this device */ unsigned expecting_cc_ua:1; /* Expecting a CHECK_CONDITION/UNIT_ATTN * because we did a bus reset. */ unsigned use_10_for_rw:1; /* first try 10-byte read / write */ unsigned use_10_for_ms:1; /* first try 10-byte mode sense/select */ unsigned set_dbd_for_ms:1; /* Set "DBD" field in mode sense */ unsigned read_before_ms:1; /* perform a READ before MODE SENSE */ unsigned no_report_opcodes:1; /* no REPORT SUPPORTED OPERATION CODES */ unsigned no_write_same:1; /* no WRITE SAME command */ unsigned use_16_for_rw:1; /* Use read/write(16) over read/write(10) */ unsigned use_16_for_sync:1; /* Use sync (16) over sync (10) */ unsigned skip_ms_page_8:1; /* do not use MODE SENSE page 0x08 */ unsigned skip_ms_page_3f:1; /* do not use MODE SENSE page 0x3f */ unsigned skip_vpd_pages:1; /* do not read VPD pages */ unsigned try_vpd_pages:1; /* attempt to read VPD pages */ unsigned use_192_bytes_for_3f:1; /* ask for 192 bytes from page 0x3f */ unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; unsigned fix_capacity:1; /* READ_CAPACITY is too high by 1 */ unsigned guess_capacity:1; /* READ_CAPACITY might be too high by 1 */ unsigned retry_hwerror:1; /* Retry HARDWARE_ERROR */ unsigned last_sector_bug:1; /* do not use multisector accesses on SD_LAST_BUGGY_SECTORS */ unsigned no_read_disc_info:1; /* Avoid READ_DISC_INFO cmds */ unsigned no_read_capacity_16:1; /* Avoid READ_CAPACITY_16 cmds */ unsigned try_rc_10_first:1; /* Try READ_CAPACACITY_10 first */ unsigned security_supported:1; /* Supports Security Protocols */ unsigned is_visible:1; /* is the device visible in sysfs */ unsigned wce_default_on:1; /* Cache is ON by default */ unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ unsigned broken_fua:1; /* Don't set FUA bit */ unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */ unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */ unsigned silence_suspend:1; /* Do not print runtime PM related messages */ unsigned no_vpd_size:1; /* No VPD size reported in header */ unsigned cdl_supported:1; /* Command duration limits supported */ unsigned cdl_enable:1; /* Enable/disable Command duration limits */ unsigned int queue_stopped; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ DECLARE_BITMAP(supported_events, SDEV_EVT_MAXBITS); /* supported events */ DECLARE_BITMAP(pending_events, SDEV_EVT_MAXBITS); /* pending events */ struct list_head event_list; /* asserted events */ struct work_struct event_work; unsigned int max_device_blocked; /* what device_blocked counts down from */ #define SCSI_DEFAULT_DEVICE_BLOCKED 3 atomic_t iorequest_cnt; atomic_t iodone_cnt; atomic_t ioerr_cnt; atomic_t iotmo_cnt; struct device sdev_gendev, sdev_dev; struct work_struct requeue_work; struct scsi_device_handler *handler; void *handler_data; size_t dma_drain_len; void *dma_drain_buf; unsigned int sg_timeout; unsigned int sg_reserved_size; struct bsg_device *bsg_dev; unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; struct task_struct *quiesced_by; unsigned long sdev_data[]; } __attribute__((aligned(sizeof(unsigned long)))); #define to_scsi_device(d) \ container_of(d, struct scsi_device, sdev_gendev) #define class_to_sdev(d) \ container_of(d, struct scsi_device, sdev_dev) #define transport_class_to_sdev(class_dev) \ to_scsi_device(class_dev->parent) #define sdev_dbg(sdev, fmt, a...) \ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) /* * like scmd_printk, but the device name is passed in * as a string pointer */ __printf(4, 5) void sdev_prefix_printk(const char *, const struct scsi_device *, const char *, const char *, ...); #define sdev_printk(l, sdev, fmt, a...) \ sdev_prefix_printk(l, sdev, NULL, fmt, ##a) __printf(3, 4) void scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...); #define scmd_dbg(scmd, fmt, a...) \ do { \ struct request *__rq = scsi_cmd_to_rq((scmd)); \ \ if (__rq->q->disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ __rq->q->disk->disk_name, ##a); \ else \ sdev_dbg((scmd)->device, fmt, ##a); \ } while (0) enum scsi_target_state { STARGET_CREATED = 1, STARGET_RUNNING, STARGET_REMOVE, STARGET_CREATED_REMOVE, STARGET_DEL, }; /* * scsi_target: representation of a scsi target, for now, this is only * used for single_lun devices. If no one has active IO to the target, * starget_sdev_user is NULL, else it points to the active sdev. */ struct scsi_target { struct scsi_device *starget_sdev_user; struct list_head siblings; struct list_head devices; struct device dev; struct kref reap_ref; /* last put renders target invisible */ unsigned int channel; unsigned int id; /* target id ... replace * scsi_device.id eventually */ unsigned int create:1; /* signal that it needs to be added */ unsigned int single_lun:1; /* Indicates we should only * allow I/O to one of the luns * for the device at a time. */ unsigned int pdt_1f_for_no_lun:1; /* PDT = 0x1f * means no lun present. */ unsigned int no_report_luns:1; /* Don't use * REPORT LUNS for scanning. */ unsigned int expecting_lun_change:1; /* A device has reported * a 3F/0E UA, other devices on * the same target will also. */ /* commands actually active on LLD. */ atomic_t target_busy; atomic_t target_blocked; /* * LLDs should set this in the slave_alloc host template callout. * If set to zero then there is not limit. */ unsigned int can_queue; unsigned int max_target_blocked; #define SCSI_DEFAULT_TARGET_BLOCKED 3 char scsi_level; enum scsi_target_state state; void *hostdata; /* available to low-level driver */ unsigned long starget_data[]; /* for the transport */ /* starget_data must be the last element!!!! */ } __attribute__((aligned(sizeof(unsigned long)))); #define to_scsi_target(d) container_of(d, struct scsi_target, dev) static inline struct scsi_target *scsi_target(struct scsi_device *sdev) { return to_scsi_target(sdev->sdev_gendev.parent); } #define transport_class_to_starget(class_dev) \ to_scsi_target(class_dev->parent) #define starget_printk(prefix, starget, fmt, a...) \ dev_printk(prefix, &(starget)->dev, fmt, ##a) extern struct scsi_device *__scsi_add_device(struct Scsi_Host *, uint, uint, u64, void *hostdata); extern int scsi_add_device(struct Scsi_Host *host, uint channel, uint target, u64 lun); extern int scsi_register_device_handler(struct scsi_device_handler *scsi_dh); extern void scsi_remove_device(struct scsi_device *); extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh); void scsi_attach_vpd(struct scsi_device *sdev); void scsi_cdl_check(struct scsi_device *sdev); int scsi_cdl_enable(struct scsi_device *sdev, bool enable); extern struct scsi_device *scsi_device_from_queue(struct request_queue *q); extern int __must_check scsi_device_get(struct scsi_device *); extern void scsi_device_put(struct scsi_device *); extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); extern struct scsi_device *__scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); extern struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *, u64); extern struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *, u64); extern void starget_for_each_device(struct scsi_target *, void *, void (*fn)(struct scsi_device *, void *)); extern void __starget_for_each_device(struct scsi_target *, void *, void (*fn)(struct scsi_device *, void *)); /* only exposed to implement shost_for_each_device */ extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *, struct scsi_device *); /** * shost_for_each_device - iterate over all devices of a host * @sdev: the &struct scsi_device to use as a cursor * @shost: the &struct scsi_host to iterate over * * Iterator that returns each device attached to @shost. This loop * takes a reference on each device and releases it at the end. If * you break out of the loop, you must call scsi_device_put(sdev). */ #define shost_for_each_device(sdev, shost) \ for ((sdev) = __scsi_iterate_devices((shost), NULL); \ (sdev); \ (sdev) = __scsi_iterate_devices((shost), (sdev))) /** * __shost_for_each_device - iterate over all devices of a host (UNLOCKED) * @sdev: the &struct scsi_device to use as a cursor * @shost: the &struct scsi_host to iterate over * * Iterator that returns each device attached to @shost. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason to use this is because you need to access the * device list in interrupt context. Otherwise you really want to use * shost_for_each_device instead. */ #define __shost_for_each_device(sdev, shost) \ list_for_each_entry((sdev), &((shost)->__devices), siblings) extern int scsi_change_queue_depth(struct scsi_device *, int); extern int scsi_track_queue_full(struct scsi_device *, int); extern int scsi_set_medium_removal(struct scsi_device *, char); int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int subpage, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr); extern int scsi_get_vpd_page(struct scsi_device *, u8 page, unsigned char *buf, int buf_len); int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa); extern int scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state); extern struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type, gfp_t gfpflags); extern void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt); extern void sdev_evt_send_simple(struct scsi_device *sdev, enum scsi_device_event evt_type, gfp_t gfpflags); extern int scsi_device_quiesce(struct scsi_device *sdev); extern void scsi_device_resume(struct scsi_device *sdev); extern void scsi_target_quiesce(struct scsi_target *); extern void scsi_target_resume(struct scsi_target *); extern void scsi_scan_target(struct device *parent, unsigned int channel, unsigned int id, u64 lun, enum scsi_scan_mode rescan); extern void scsi_target_reap(struct scsi_target *); void scsi_block_targets(struct Scsi_Host *shost, struct device *dev); extern void scsi_target_unblock(struct device *, enum scsi_device_state); extern void scsi_remove_target(struct device *); extern const char *scsi_device_state_name(enum scsi_device_state); extern int scsi_is_sdev_device(const struct device *); extern int scsi_is_target_device(const struct device *); extern void scsi_sanitize_inquiry_string(unsigned char *s, int len); /* * scsi_execute_cmd users can set scsi_failure.result to have * scsi_check_passthrough fail/retry a command. scsi_failure.result can be a * specific host byte or message code, or SCMD_FAILURE_RESULT_ANY can be used * to match any host or message code. */ #define SCMD_FAILURE_RESULT_ANY 0x7fffffff /* * Set scsi_failure.result to SCMD_FAILURE_STAT_ANY to fail/retry any failure * scsi_status_is_good returns false for. */ #define SCMD_FAILURE_STAT_ANY 0xff /* * The following can be set to the scsi_failure sense, asc and ascq fields to * match on any sense, ASC, or ASCQ value. */ #define SCMD_FAILURE_SENSE_ANY 0xff #define SCMD_FAILURE_ASC_ANY 0xff #define SCMD_FAILURE_ASCQ_ANY 0xff /* Always retry a matching failure. */ #define SCMD_FAILURE_NO_LIMIT -1 struct scsi_failure { int result; u8 sense; u8 asc; u8 ascq; /* * Number of times scsi_execute_cmd will retry the failure. It does * not count for the total_allowed. */ s8 allowed; /* Number of times the failure has been retried. */ s8 retries; }; struct scsi_failures { /* * If a scsi_failure does not have a retry limit setup this limit will * be used. */ int total_allowed; int total_retries; struct scsi_failure *failure_definitions; }; /* Optional arguments to scsi_execute_cmd */ struct scsi_exec_args { unsigned char *sense; /* sense buffer */ unsigned int sense_len; /* sense buffer len */ struct scsi_sense_hdr *sshdr; /* decoded sense header */ blk_mq_req_flags_t req_flags; /* BLK_MQ_REQ flags */ int scmd_flags; /* SCMD flags */ int *resid; /* residual length */ struct scsi_failures *failures; /* failures to retry */ }; int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, blk_opf_t opf, void *buffer, unsigned int bufflen, int timeout, int retries, const struct scsi_exec_args *args); void scsi_failures_reset_retries(struct scsi_failures *failures); extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t); extern int scsi_vpd_tpg_id(struct scsi_device *, int *); #ifdef CONFIG_PM extern int scsi_autopm_get_device(struct scsi_device *); extern void scsi_autopm_put_device(struct scsi_device *); #else static inline int scsi_autopm_get_device(struct scsi_device *d) { return 0; } static inline void scsi_autopm_put_device(struct scsi_device *d) {} #endif /* CONFIG_PM */ static inline int __must_check scsi_device_reprobe(struct scsi_device *sdev) { return device_reprobe(&sdev->sdev_gendev); } static inline unsigned int sdev_channel(struct scsi_device *sdev) { return sdev->channel; } static inline unsigned int sdev_id(struct scsi_device *sdev) { return sdev->id; } #define scmd_id(scmd) sdev_id((scmd)->device) #define scmd_channel(scmd) sdev_channel((scmd)->device) /* * checks for positions of the SCSI state machine */ static inline int scsi_device_online(struct scsi_device *sdev) { return (sdev->sdev_state != SDEV_OFFLINE && sdev->sdev_state != SDEV_TRANSPORT_OFFLINE && sdev->sdev_state != SDEV_DEL); } static inline int scsi_device_blocked(struct scsi_device *sdev) { return sdev->sdev_state == SDEV_BLOCK || sdev->sdev_state == SDEV_CREATED_BLOCK; } static inline int scsi_device_created(struct scsi_device *sdev) { return sdev->sdev_state == SDEV_CREATED || sdev->sdev_state == SDEV_CREATED_BLOCK; } int scsi_internal_device_block_nowait(struct scsi_device *sdev); int scsi_internal_device_unblock_nowait(struct scsi_device *sdev, enum scsi_device_state new_state); /* accessor functions for the SCSI parameters */ static inline int scsi_device_sync(struct scsi_device *sdev) { return sdev->sdtr; } static inline int scsi_device_wide(struct scsi_device *sdev) { return sdev->wdtr; } static inline int scsi_device_dt(struct scsi_device *sdev) { return sdev->ppr; } static inline int scsi_device_dt_only(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return (sdev->inquiry[56] & 0x0c) == 0x04; } static inline int scsi_device_ius(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return sdev->inquiry[56] & 0x01; } static inline int scsi_device_qas(struct scsi_device *sdev) { if (sdev->inquiry_len < 57) return 0; return sdev->inquiry[56] & 0x02; } static inline int scsi_device_enclosure(struct scsi_device *sdev) { return sdev->inquiry ? (sdev->inquiry[6] & (1<<6)) : 1; } static inline int scsi_device_protection(struct scsi_device *sdev) { if (sdev->no_dif) return 0; return sdev->scsi_level > SCSI_2 && sdev->inquiry[5] & (1<<0); } static inline int scsi_device_tpgs(struct scsi_device *sdev) { return sdev->inquiry ? (sdev->inquiry[5] >> 4) & 0x3 : 0; } /** * scsi_device_supports_vpd - test if a device supports VPD pages * @sdev: the &struct scsi_device to test * * If the 'try_vpd_pages' flag is set it takes precedence. * Otherwise we will assume VPD pages are supported if the * SCSI level is at least SPC-3 and 'skip_vpd_pages' is not set. */ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) { /* Attempt VPD inquiry if the device blacklist explicitly calls * for it. */ if (sdev->try_vpd_pages) return 1; /* * Although VPD inquiries can go to SCSI-2 type devices, * some USB ones crash on receiving them, and the pages * we currently ask for are mandatory for SPC-2 and beyond */ if (sdev->scsi_level >= SCSI_SPC_2 && !sdev->skip_vpd_pages) return 1; return 0; } static inline int scsi_device_busy(struct scsi_device *sdev) { return sbitmap_weight(&sdev->budget_map); } #define MODULE_ALIAS_SCSI_DEVICE(type) \ MODULE_ALIAS("scsi:t-" __stringify(type) "*") #define SCSI_DEVICE_MODALIAS_FMT "scsi:t-0x%02x" #endif /* _SCSI_SCSI_DEVICE_H */
44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 /* SPDX-License-Identifier: GPL-2.0 */ /* * NFS internal definitions */ #include "nfs4_fs.h" #include <linux/fs_context.h> #include <linux/security.h> #include <linux/crc32.h> #include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> #include <linux/wait_bit.h> #define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; struct nfs_string; struct nfs_pageio_descriptor; static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) { if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) return 0; return 1; } static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) { if (!(NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL)) return false; if (!d_is_positive(dentry) || !NFS_FH(d_inode(dentry))->size) return false; return true; } static inline fmode_t flags_to_mode(int flags) { fmode_t res = (__force fmode_t)flags & FMODE_EXEC; if ((flags & O_ACCMODE) != O_WRONLY) res |= FMODE_READ; if ((flags & O_ACCMODE) != O_RDONLY) res |= FMODE_WRITE; return res; } /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. */ #define NFS_MAX_SECFLAVORS (12) /* * Value used if the user did not specify a port value. */ #define NFS_UNSPEC_PORT (-1) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ const struct sockaddr_storage *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; unsigned int nconnect; unsigned int max_connect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; unsigned long connect_timeout; unsigned long reconnect_timeout; }; /* * In-kernel mount arguments */ struct nfs_fs_context { bool internal; bool skip_reconfig_option_check; bool need_mount; bool sloppy; unsigned int flags; /* NFS{,4}_MOUNT_* flags */ unsigned int rsize, wsize; unsigned int timeo, retrans; unsigned int acregmin, acregmax; unsigned int acdirmin, acdirmax; unsigned int namlen; unsigned int options; unsigned int bsize; struct nfs_auth_info auth_info; rpc_authflavor_t selected_flavor; struct xprtsec_parms xprtsec; char *client_address; unsigned int version; unsigned int minorversion; char *fscache_uniq; unsigned short protofamily; unsigned short mountfamily; bool has_sec_mnt_opts; int lock_status; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; u32 version; int port; unsigned short protocol; } mount_server; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; char *export_path; int port; unsigned short protocol; unsigned short nconnect; unsigned short max_connect; unsigned short export_path_len; } nfs_server; struct nfs_fh *mntfh; struct nfs_server *server; struct nfs_subversion *nfs_mod; /* Information for a cloned mount. */ struct nfs_clone_mount { struct super_block *sb; struct dentry *dentry; struct nfs_fattr *fattr; unsigned int inherited_bsize; } clone_data; }; enum nfs_lock_status { NFS_LOCK_NOT_SET = 0, NFS_LOCK_LOCK = 1, NFS_LOCK_NOLOCK = 2, }; #define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) #define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) { return fc->fs_private; } /* mount_clnt.c */ struct nfs_mount_request { struct sockaddr_storage *sap; size_t salen; char *hostname; char *dirpath; u32 version; unsigned short protocol; struct nfs_fh *fh; int noresvport; unsigned int *auth_flav_len; rpc_authflavor_t *auth_flavs; struct net *net; }; extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern void nfs_clients_exit(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_server(struct nfs_server *, struct nfs_fh *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); extern void nfs_put_client(struct nfs_client *); extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct fs_context *); extern void nfs4_server_set_init_caps(struct nfs_server *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr_storage *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); extern bool nfs_client_init_is_complete(const struct nfs_client *clp); extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr_storage *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); extern int nfs_fs_proc_net_init(struct net *net); extern void nfs_fs_proc_net_exit(struct net *net); #else static inline int nfs_fs_proc_net_init(struct net *net) { return 0; } static inline void nfs_fs_proc_net_exit(struct net *net) { } static inline int nfs_fs_proc_init(void) { return 0; } static inline void nfs_fs_proc_exit(void) { } #endif /* callback_xdr.c */ extern const struct svc_version nfs4_callback_version1; extern const struct svc_version nfs4_callback_version4; /* fs_context.c */ extern struct file_system_type nfs_fs_type; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); extern int __init nfs_init_readpagecache(void); extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, const struct rpc_call_ops *call_ops, int how, int flags); void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state; } /* nfs2xdr.c */ extern const struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs3xdr.c */ extern const struct rpc_procinfo nfs3_procedures[]; extern int nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs4xdr.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; extern const u32 nfs41_maxwrite_overhead; extern const u32 nfs41_maxgetdevinfo_overhead; #endif /* nfs4proc.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern const struct rpc_procinfo nfs4_procedures[]; #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { if (!dst || !src) return NULL; if (src->len > NFS4_MAXLABELLEN) return NULL; dst->lfs = src->lfs; dst->pi = src->pi; dst->len = src->len; memcpy(dst->label, src->label, src->len); return dst; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) nfsi->cache_validity |= NFS_INO_INVALID_LABEL; } #else static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { return NULL; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ extern void nfs_readdir_record_entry_cache_hit(struct inode *dir); extern void nfs_readdir_record_entry_cache_miss(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); int nfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int nfs_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); #ifdef CONFIG_NFS_V4_2 static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) { if (!(server->caps & NFS_CAP_XATTR)) return 0; return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST; } #else static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) { return 0; } #endif /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); extern void nfs_zap_acl_cache(struct inode *inode); extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); /* super.c */ extern const struct super_operations nfs_sops; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); int nfs_try_get_tree(struct fs_context *); int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); extern int nfs_client_for_each_server(struct nfs_client *clp, int (*fn)(struct nfs_server *, void *), void *data); #ifdef CONFIG_NFS_FSCACHE extern const struct netfs_request_ops nfs_netfs_ops; #endif /* io.c */ extern void nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); extern void nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); extern void nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) { return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); int nfs_submount(struct fs_context *, struct nfs_server *); int nfs_do_submount(struct fs_context *); /* getroot.c */ extern int nfs_get_root(struct super_block *s, struct fs_context *fc); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); #endif struct nfs_pgio_completion_ops; /* read.c */ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); extern void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); int nfs_reconfigure(struct fs_context *); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, int how, int flags); extern void nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4_1 static inline void pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, unsigned int nbuckets) { unsigned int i; for (i = 0; i < nbuckets; i++) buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { struct pnfs_commit_array *array; rcu_read_lock(); list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, array->nbuckets); rcu_read_unlock(); } #else static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { } #endif #ifdef CONFIG_MIGRATION int nfs_migrate_folio(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); #else #define nfs_migrate_folio NULL #endif static inline int nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, const struct nfs_write_verifier *v2) { return memcmp(v1->data, v2->data, sizeof(v1->data)); } static inline bool nfs_write_match_verf(const struct nfs_writeverf *verf, struct nfs_page *req) { return verf->committed > NFS_UNSTABLE && !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } static inline gfp_t nfs_io_gfp_mask(void) { if (current->flags & PF_WQ_WORKER) return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; return GFP_KERNEL; } /* * Special version of should_remove_suid() that ignores capabilities. */ static inline int nfs_should_remove_suid(const struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; /* * sgid without any exec bits is just a mandatory locking mark; leave * it alone. If some exec bits are set, it's a real sgid; kill it. */ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; if (unlikely(kill && S_ISREG(mode))) return kill; return 0; } /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *old_dentry, struct dentry *new_dentry, void (*complete)(struct rpc_task *, struct nfs_renamedata *)); extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { struct super_block *sb = inode->i_sb; if (sb && nfs_sb_active(sb)) { if (igrab(inode)) return inode; nfs_sb_deactive(sb); } return NULL; } static inline void nfs_iput_and_deactive(struct inode *inode) { if (inode != NULL) { struct super_block *sb = inode->i_sb; iput(inode); nfs_sb_deactive(sb); } } /* * Determine the device name as a string */ static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { char *dummy; return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* * Determine the actual block size (and log2 thereof) */ static inline unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) { /* make sure blocksize is a power of two */ if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) ; bsize = 1UL << nrbits; if (nrbitsp) *nrbitsp = nrbits; } return bsize; } /* * Calculate the number of 512byte blocks used. */ static inline blkcnt_t nfs_calc_block_size(u64 tsize) { blkcnt_t used = (tsize + 511) >> 9; return (used > ULONG_MAX) ? ULONG_MAX : used; } /* * Compute and set NFS server blocksize */ static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { if (bsize < NFS_MIN_FILE_IO_SIZE) bsize = NFS_DEF_FILE_IO_SIZE; else if (bsize >= NFS_MAX_FILE_IO_SIZE) bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } /* * Compute and set NFS server rsize / wsize */ static inline unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) { if (iosize < NFS_MIN_FILE_IO_SIZE) iosize = NFS_DEF_FILE_IO_SIZE; else if (iosize >= NFS_MAX_FILE_IO_SIZE) iosize = NFS_MAX_FILE_IO_SIZE; if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE) return nfs_block_bits(iosize, NULL); return iosize & PAGE_MASK; } /* * Determine the maximum file size for a superblock */ static inline void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) { sb->s_maxbytes = (loff_t)maxfilesize; if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) sb->s_maxbytes = MAX_LFS_FILESIZE; } /* * Record the page as unstable (an extra writeback period) and mark its * inode as dirty. */ static inline void nfs_folio_mark_unstable(struct folio *folio, struct nfs_commit_info *cinfo) { if (folio && !cinfo->dreq) { struct inode *inode = folio->mapping->host; long nr = folio_nr_pages(folio); /* This page is really still in write-back - just that the * writeback is happening on the server now. */ node_stat_mod_folio(folio, NR_WRITEBACK, nr); wb_stat_mod(&inode_to_bdi(inode)->wb, WB_WRITEBACK, nr); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } /* * Determine the number of bytes of data the page contains */ static inline size_t nfs_folio_length(struct folio *folio) { loff_t i_size = i_size_read(folio->mapping->host); if (i_size > 0) { pgoff_t index = folio->index >> folio_order(folio); pgoff_t end_index = (i_size - 1) >> folio_shift(folio); if (index < end_index) return folio_size(folio); if (index == end_index) return offset_in_folio(folio, i_size - 1) + 1; } return 0; } /* * Convert a umode to a dirent->d_type */ static inline unsigned char nfs_umode_to_dtype(umode_t mode) { return (mode >> 12) & 15; } /* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } /* * Convert a struct timespec64 into a 64-bit change attribute * * This does approximately the same thing as timespec64_to_ns(), * but for calculation efficiency, we multiply the seconds by * 1024*1024*1024. */ static inline u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } #ifdef CONFIG_CRC32 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } #else static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) { return 0; } #endif static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: case -ESTALE: case -E2BIG: case -ENOMEM: case -ETIMEDOUT: return true; default: return false; } } static inline bool nfs_error_is_fatal_on_server(int err) { switch (err) { case 0: case -ERESTARTSYS: case -EINTR: case -ENOMEM: return false; } return nfs_error_is_fatal(err); } /* * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ static inline void nfs_set_port(struct sockaddr_storage *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; rpc_set_port((struct sockaddr *)sap, *port); } struct nfs_direct_req { struct kref kref; /* release manager */ /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ /* completion state */ atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ /* commit state */ struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ struct work_struct work; int flags; /* for write */ #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ /* for read */ #define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ #define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ };
16 18 178 180 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; timer-related code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/times.h> #include "br_private.h" #include "br_private_stp.h" /* called under bridge lock */ static int br_is_designated_for_some_port(const struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && !memcmp(&p->designated_bridge, &br->bridge_id, 8)) return 1; } return 0; } static void br_hello_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, hello_timer); br_debug(br, "hello timer expired\n"); spin_lock(&br->lock); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } spin_unlock(&br->lock); } static void br_message_age_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, message_age_timer); struct net_bridge *br = p->br; const bridge_id *id = &p->designated_bridge; int was_root; if (p->state == BR_STATE_DISABLED) return; br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n", (unsigned int) p->port_no, p->dev->name, id->prio[0], id->prio[1], &id->addr); /* * According to the spec, the message age timer cannot be * running when we are the root bridge. So.. this was_root * check is redundant. I'm leaving it in for now, though. */ spin_lock(&br->lock); if (p->state == BR_STATE_DISABLED) goto unlock; was_root = br_is_root_bridge(br); br_become_designated_port(p); br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !was_root) br_become_root_bridge(br); unlock: spin_unlock(&br->lock); } static void br_forward_delay_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, forward_delay_timer); struct net_bridge *br = p->br; br_debug(br, "port %u(%s) forward delay timer\n", (unsigned int) p->port_no, p->dev->name); spin_lock(&br->lock); if (p->state == BR_STATE_LISTENING) { br_set_state(p, BR_STATE_LEARNING); mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } else if (p->state == BR_STATE_LEARNING) { br_set_state(p, BR_STATE_FORWARDING); if (br_is_designated_for_some_port(br)) br_topology_change_detection(br); netif_carrier_on(br->dev); } rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, NULL, p); rcu_read_unlock(); spin_unlock(&br->lock); } static void br_tcn_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, tcn_timer); br_debug(br, "tcn timer expired\n"); spin_lock(&br->lock); if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) { br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } spin_unlock(&br->lock); } static void br_topology_change_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, topology_change_timer); br_debug(br, "topo change timer expired\n"); spin_lock(&br->lock); br->topology_change_detected = 0; __br_set_topology_change(br, 0); spin_unlock(&br->lock); } static void br_hold_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, hold_timer); br_debug(p->br, "port %u(%s) hold timer expired\n", (unsigned int) p->port_no, p->dev->name); spin_lock(&p->br->lock); if (p->config_pending) br_transmit_config(p); spin_unlock(&p->br->lock); } void br_stp_timer_init(struct net_bridge *br) { timer_setup(&br->hello_timer, br_hello_timer_expired, 0); timer_setup(&br->tcn_timer, br_tcn_timer_expired, 0); timer_setup(&br->topology_change_timer, br_topology_change_timer_expired, 0); } void br_stp_port_timer_init(struct net_bridge_port *p) { timer_setup(&p->message_age_timer, br_message_age_timer_expired, 0); timer_setup(&p->forward_delay_timer, br_forward_delay_timer_expired, 0); timer_setup(&p->hold_timer, br_hold_timer_expired, 0); } /* Report ticks left (in USER_HZ) used for API */ unsigned long br_timer_value(const struct timer_list *timer) { return timer_pending(timer) ? jiffies_delta_to_clock_t(timer->expires - jiffies) : 0; }
642 617 642 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 // SPDX-License-Identifier: GPL-2.0 struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; struct io_ring_ctx *ctx; }; int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args); int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args); /* * Note that this task has used io_uring. We use it for cancelation purposes. */ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; if (likely(tctx && tctx->last == ctx)) return 0; return __io_uring_add_tctx_node_from_submit(ctx); }
2 1315 1315 1316 1314 1318 204 203 190 190 184 1311 460 1318 1315 1312 1 1312 1317 1319 1315 1315 1318 1312 1315 1313 1314 1310 1317 1311 1315 189 189 186 184 186 185 184 186 184 183 1 1 5 164 3 184 182 123 89 125 164 3 164 85 186 186 164 126 140 5 139 80 170 131 85 169 186 190 188 189 203 205 204 204 204 200 200 195 189 196 190 188 199 93 56 93 55 92 206 208 208 207 206 206 199 199 2 195 191 189 186 186 95 183 169 170 78 22 203 3 2 1 1 1 1 1 1 1 1 1 1 4 3 2 2 1 1 4 1 15 1 14 1 13 13 13 14 15 209 208 207 208 207 207 207 207 15 15 14 202 4 4 2 2 1 1 1 1 2 4 212 218 218 210 210 212 15 14 3 15 4 4 7 1 7 6 1 5 3 3 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 // SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cache.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/ip.h> #include <net/compat.h> #include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/nf_log.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv4 packet filter"); void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); } EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool ip_packet_match(const struct iphdr *ip, const char *indev, const char *outdev, const struct ipt_ip *ipinfo, int isfrag) { unsigned long ret; if (NF_INVF(ipinfo, IPT_INV_SRCIP, (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) || NF_INVF(ipinfo, IPT_INV_DSTIP, (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr)) return false; ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0)) return false; /* Check specific protocol */ if (ipinfo->proto && NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto)) return false; /* If we have a fragment rule but the packet is not a fragment * then we return zero */ if (NF_INVF(ipinfo, IPT_INV_FRAG, (ipinfo->flags & IPT_F_FRAG) && !isfrag)) return false; return true; } static bool ip_checkentry(const struct ipt_ip *ip) { if (ip->flags & ~IPT_F_MASK) return false; if (ip->invflags & ~IPT_INV_MASK) return false; return true; } static unsigned int ipt_error(struct sk_buff *skb, const struct xt_action_param *par) { net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } /* Performance critical */ static inline struct ipt_entry * get_entry(const void *base, unsigned int offset) { return (struct ipt_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; return e->target_offset == sizeof(struct ipt_entry) && memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; } /* for const-correctness */ static inline const struct xt_entry_target * ipt_get_target_c(const struct ipt_entry *e) { return ipt_get_target((struct ipt_entry *)e); } #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", [NF_INET_FORWARD] = "FORWARD", [NF_INET_LOCAL_OUT] = "OUTPUT", [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { NF_IP_TRACE_COMMENT_RULE, NF_IP_TRACE_COMMENT_RETURN, NF_IP_TRACE_COMMENT_POLICY, }; static const char *const comments[] = { [NF_IP_TRACE_COMMENT_RULE] = "rule", [NF_IP_TRACE_COMMENT_RETURN] = "return", [NF_IP_TRACE_COMMENT_POLICY] = "policy", }; static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = 4, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; /* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { const struct xt_standard_target *t = (void *)ipt_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] : comments[NF_IP_TRACE_COMMENT_RETURN]; } return 1; } else (*rulenum)++; return 0; } static void trace_packet(struct net *net, const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, const struct xt_table_info *private, const struct ipt_entry *e) { const struct ipt_entry *root; const char *hookname, *chainname, *comment; const struct ipt_entry *iter; unsigned int rulenum = 0; root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) if (get_chainname_rulenum(iter, e, hookname, &chainname, &comment, &rulenum) != 0) break; nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif static inline struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) { return (void *)entry + entry->next_offset; } /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ipt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET; acpar.thoff = ip_hdrlen(skb); acpar.hotdrop = false; acpar.state = state; WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; /* Switch to alternate jumpstack if we're being invoked via TEE. * TEE issues XT_CONTINUE verdict on original skb so we must not * clobber the jumpstack. * * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; struct xt_counters *counter; WARN_ON(!e); if (!ip_packet_match(ip, indev, outdev, &e->ip, acpar.fragoff)) { no_match: e = ipt_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(state->net, skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); } else { e = jumpstack[--stackidx]; e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) { /* Target might have changed stuff. */ ip = ip_hdr(skb); e = ipt_next_entry(e); } else { /* Verdict */ break; } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV4; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); } static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; par->match = m->u.kernel.match; par->matchinfo = m->data; return xt_check_match(par, m->u.match_size - sizeof(*m), ip->proto, ip->invflags & IPT_INV_PROTO); } static int find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; ret = check_match(m, par); if (ret) goto err; return 0; err: module_put(m->u.kernel.match->me); return ret; } static int check_target(struct ipt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_IPV4, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ip.proto, e->ip.invflags & IPT_INV_PROTO); } static int find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; j = 0; memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV4; xt_ematch_foreach(ematch, e) { ret = find_check_match(ematch, &mtpar); if (ret != 0) goto cleanup_matches; ++j; } t = ipt_get_target(e); target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; cleanup_match(ematch, net); } xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct ipt_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if (!ip_checkentry(&e->ip)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct ipt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) cleanup_match(ematch, net); t = ipt_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV4; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ipt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ipt_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ipt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ipt_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ipt_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { const struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; /* macro does multi eval of i */ } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; const struct xt_entry_match *m; const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } for (i = sizeof(struct ipt_entry); i < e->target_offset; i += m->u.match_size) { m = (void *)e + i; if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ipt_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(AF_INET, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(AF_INET, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct ipt_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - base; xt_ematch_foreach(ematch, e) off += xt_compat_match_offset(ematch->u.kernel.match); t = ipt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_INET_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct ipt_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct ipt_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ipt_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(AF_INET, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct ipt_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif return ret; } static int get_entries(struct net *net, struct ipt_get_entries __user *uptr, const int *len) { int ret; struct ipt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ipt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; struct ipt_entry *iter; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, AF_INET, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); return 0; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ipt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct ipt_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ struct compat_ipt_entry entries[]; }; static int compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_ipt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; const struct xt_entry_match *ematch; int ret = 0; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_ipt_entry); *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); xt_ematch_foreach(ematch, e) { ret = xt_compat_match_to_user(ematch, dstptr, size); if (ret != 0) return ret; } target_offset = e->target_offset - (origsize - *size); t = ipt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_find_calc_match(struct xt_entry_match *m, const struct ipt_ip *ip, int *size) { struct xt_match *match; match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; } static void compat_release_entry(struct compat_ipt_entry *e) { struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) module_put(ematch->u.kernel.match->me); t = compat_ipt_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_match *ematch; struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; int ret, off; if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_ipt_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!ip_checkentry(&e->ip)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { ret = compat_find_calc_match(ematch, &e->ip, &off); if (ret != 0) goto release_matches; ++j; } t = compat_ipt_get_target(e); target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(AF_INET, entry_offset, off); if (ret) goto out; return 0; out: module_put(t->u.kernel.target->me); release_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; module_put(ematch->u.kernel.match->me); } return ret; } static void compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct ipt_entry *de; unsigned int origsize; int h; struct xt_entry_match *ematch; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct ipt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct ipt_entry); *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); xt_ematch_foreach(ematch, e) xt_compat_match_from_user(ematch, dstptr, size); de->target_offset = e->target_offset - (origsize - *size); t = compat_ipt_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_ipt_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_ipt_entry *iter0; struct ipt_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(AF_INET); ret = xt_compat_init_offsets(AF_INET, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone. * entry1/newinfo contains a 64bit ruleset that looks exactly as * generated by 64bit userspace. * * Call standard translate_table() to validate all hook_entrys, * underflows, check for loops, etc. */ xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_INET_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ipt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ipt_entry entrytable[]; }; static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct ipt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } static int compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, int *len) { int ret; struct compat_ipt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_ipt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(AF_INET); return ret; } #endif static int do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IPT_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case IPT_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IPT_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case IPT_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case IPT_SO_GET_REVISION_MATCH: case IPT_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; if (cmd == IPT_SO_GET_REVISION_TARGET) target = 1; else target = 0; try_then_request_module(xt_find_revision(AF_INET, rev.name, rev.revision, target, &ret), "ipt_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __ipt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct ipt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } /* No template? No need to do anything. This is used by 'nat' table, it registers * with the nat core instead of the netfilter core. */ if (!template_ops) return 0; num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __ipt_unregister_table(net, new_table); return ret; } void ipt_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ipt_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); } static struct xt_target ipt_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = ipt_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV4, }, }; static struct nf_sockopt_ops ipt_sockopts = { .pf = PF_INET, .set_optmin = IPT_BASE_CTL, .set_optmax = IPT_SO_SET_MAX+1, .set = do_ipt_set_ctl, .get_optmin = IPT_BASE_CTL, .get_optmax = IPT_SO_GET_MAX+1, .get = do_ipt_get_ctl, .owner = THIS_MODULE, }; static int __net_init ip_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV4); } static void __net_exit ip_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_IPV4); } static struct pernet_operations ip_tables_net_ops = { .init = ip_tables_net_init, .exit = ip_tables_net_exit, }; static int __init ip_tables_init(void) { int ret; ret = register_pernet_subsys(&ip_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&ipt_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); err2: unregister_pernet_subsys(&ip_tables_net_ops); err1: return ret; } static void __exit ip_tables_fini(void) { nf_unregister_sockopt(&ipt_sockopts); xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg)); unregister_pernet_subsys(&ip_tables_net_ops); } EXPORT_SYMBOL(ipt_register_table); EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); module_exit(ip_tables_fini);
9762 2187 2016 9755 9866 50 3227 6215 979 61 932 980 978 46 61 46 67 6177 6184 266 185 6105 3227 3530 5162 67 5169 5159 2829 5167 5158 2953 2951 765 2953 219 219 89 219 143 144 73 73 73 20 20 20 70 71 29 71 2776 2775 2774 2779 2774 2536 6286 2536 133 123 287 2973 2967 1964 1 6244 589 6245 173 172 6250 6241 5033 1941 6174 5888 1098 6178 2972 6177 2974 6178 6177 6192 2885 6244 6258 6245 7711 3994 3994 2242 4025 3139 2953 2365 715 85 6 6 85 1 1 14161 7702 7671 7691 3011 6211 14266 14184 14179 12976 6193 6200 183 178 10 73 73 50 73 2042 2042 2039 1220 1223 1663 205 144 172 190 22 119 119 117 3 307 141 141 141 306 3 3 3 2 3 3 141 35 141 2420 143 8 1 141 141 21 21 21 20 21 21 21 21 50 50 50 241 241 240 241 239 1098 1098 1093 1093 84 84 20 84 10 84 1091 7 10 10 10 8 8 1096 1094 1094 13 13 4 2 13 2037 1900 1 1899 2037 1873 2037 2038 2036 2039 1083 83 83 71 26 83 71 1083 1083 71 1083 70 160 160 159 158 160 159 846 1 845 848 915 2 915 847 847 845 1 847 1 1 13642 13641 3131 13480 592 592 13326 13488 13663 10170 13644 13643 7451 7456 7466 7459 652 16 16 16 16 9454 9455 4534 9446 385 13648 13641 13639 13022 1207 13025 1142 13032 3673 13032 369 13032 5241 13027 610 13026 360 13110 13108 5779 5781 5773 12480 8884 8632 12288 13129 2 12045 12043 42 12060 12064 11819 11818 11827 827 829 831 832 2557 2557 2543 2543 81 76 53 53 45 46 46 45 45 46 46 53 81 81 2128 2130 2128 779 263 264 220 219 220 217 778 11613 779 11326 9808 9375 9379 9391 11619 7130 6791 6995 7416 3045 3043 2117 2118 2117 4 7413 191 191 175 440 175 298 297 7290 7301 171 6106 6100 6117 13 13 13 6252 6252 6248 14 1 14 14 6250 6253 6253 6244 6246 18 13 13 13 13 13 13 13 13 10 10 6253 6245 6258 1084 3 6242 6248 6236 6239 6238 254 894 7259 6092 6105 6094 2116 3019 3023 3027 5422 6104 894 3023 4119 409 4124 3 1 2 52 2 50 54 520 50 521 101 440 521 4 534 555 554 555 555 555 555 13 541 91 495 494 69 554 21 21 21 555 536 554 534 553 521 520 506 54 54 54 54 54 553 541 555 555 555 555 21 552 156 554 540 554 501 53 54 53 54 54 697 692 9 2 1 1 7 1 1 1 4521 4442 4442 2726 1648 26 26 3 3 23 9 9 9 1 9 14 26 4433 4525 375 374 298 298 298 298 298 375 40 25 40 25 40 280 280 280 279 452 450 451 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" #include <asm/runtime-const.h> /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } struct external_name { union { atomic_t count; struct rcu_head head; } u; unsigned char name[]; }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_iname; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { spin_lock(&dentry->d_lock); name->name = dentry->d_name; if (unlikely(dname_external(dentry))) { atomic_inc(&external_name(dentry)->u.count); } else { memcpy(name->inline_name, dentry->d_iname, dentry->d_name.len + 1); name->name.name = name->inline_name; } spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. */ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->u.count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: dentry = d_first_child(this_parent); resume: hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { rcu_read_unlock(); goto resume; } } goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { data->found++; } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (!dentry->d_lockref.count) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->u.count, 1); dname = p->name; } else { dname = dentry->d_iname; } dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; if (!sb->s_d_op) d_set_d_op(dentry, &anon_ops); } return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. */ if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct super_block *sb; struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); sb = inode->i_sb; res = d_find_any_alias(inode); /* existing alias? */ if (res) goto out; new = d_alloc_anon(sb); if (!new) { res = ERR_PTR(-ENOMEM); goto out; } security_d_instantiate(new, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); /* recheck under lock */ if (likely(!res)) { /* still no alias, attach a disconnected dentry */ unsigned add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); hlist_bl_unlock(&sb->s_roots); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); inode = NULL; /* consumed by new->d_inode */ res = new; } else { spin_unlock(&inode->i_lock); dput(new); } out: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the case-exact dentry * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { d_lookup_done(found); dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name * @parent: parent dentry * @dentry: the negative dentry that was passed to the parent's lookup func * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false */ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } EXPORT_SYMBOL_GPL(d_same_name); /* * This is __d_lookup_rcu() when the parent dentry has * DCACHE_OP_COMPARE, which makes things much nastier. */ static noinline struct dentry *__d_lookup_rcu_op_compare( const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { int tlen; const char *tname; unsigned seq; seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash_len != hashlen) continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); /* * - Unhash the dentry * - Retrieve and clear the waitqueue head in dentry * - Return the waitqueue head */ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); wake_up_all(__d_lookup_unhash(dentry)); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); /** * d_exact_alias - find and hash an exact unhashed alias * @entry: dentry to add * @inode: The inode to go with this dentry * * If an unhashed dentry with the same name/parent and desired * inode already exists, hash and return it. Otherwise, return * NULL. * * Parent directory should be locked. */ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. */ if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { spin_unlock(&alias->d_lock); alias = NULL; } else { dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); return alias; } spin_unlock(&inode->i_lock); return NULL; } EXPORT_SYMBOL(d_exact_alias); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ memcpy(target->d_iname, dentry->d_name.name, dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; target->d_name.name = target->d_iname; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); target->d_name.name = dentry->d_name.name; dentry->d_name.name = dentry->d_iname; } else { /* * Both are internal. */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); } } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->u.count); dentry->d_name = target->d_name; } else { memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); dentry->d_name.name = dentry->d_iname; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) kfree_rcu(old_name, u.head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); if (!hlist_unhashed(&target->d_sib)) __hlist_del(&target->d_sib); hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } if (!hlist_unhashed(&dentry->d_sib)) __hlist_del(&dentry->d_sib); hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; /* Access d_parent under rcu as d_move() may change it. */ rcu_read_lock(); seq = read_seqbegin(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); /* Try lockless once... */ if (read_seqretry(&rename_lock, seq)) { /* ...else acquire lock for progress even on deep chains. */ read_seqlock_excl(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); read_sequnlock_excl(&rename_lock); } rcu_read_unlock(); return subdir; } EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill); } void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); } EXPORT_SYMBOL(d_mark_tmpfile); void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; inode_dec_link_count(inode); d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); /* * Obtain inode number of the parent dentry. */ ino_t d_parent_ino(struct dentry *dentry) { struct dentry *parent; struct inode *iparent; unsigned seq; ino_t ret; scoped_guard(rcu) { seq = raw_seqcount_begin(&dentry->d_seq); parent = READ_ONCE(dentry->d_parent); iparent = d_inode_rcu(parent); if (likely(iparent)) { ret = iparent->i_ino; if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; } } spin_lock(&dentry->d_lock); ret = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
1 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct stats_req_info { struct ethnl_req_info base; DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); enum ethtool_mac_stats_src src; }; #define STATS_REQINFO(__req_base) \ container_of(__req_base, struct stats_req_info, base) struct stats_reply_data { struct ethnl_reply_data base; struct_group(stats, struct ethtool_eth_phy_stats phy_stats; struct ethtool_eth_mac_stats mac_stats; struct ethtool_eth_ctrl_stats ctrl_stats; struct ethtool_rmon_stats rmon_stats; ); const struct ethtool_rmon_hist_range *rmon_ranges; }; #define STATS_REPDATA(__reply_base) \ container_of(__reply_base, struct stats_reply_data, base) const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", [ETHTOOL_STATS_ETH_MAC] = "eth-mac", [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", [ETHTOOL_STATS_RMON] = "rmon", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", }; const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK", [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames", [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames", [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors", [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors", [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK", [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions", [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions", [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls", [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError", [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors", [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError", [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK", [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK", [ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral", [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK", [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors", [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField", [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", }; const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted", [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived", [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", }; const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts", [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts", [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments", [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", }; const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, [ETHTOOL_A_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int stats_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct stats_req_info *req_info = STATS_REQINFO(req_base); bool mod = false; int err; err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT, tb[ETHTOOL_A_STATS_GROUPS], stats_std_names, extack, &mod); if (err) return err; if (!mod) { NL_SET_ERR_MSG(extack, "no stats requested"); return -EINVAL; } if (tb[ETHTOOL_A_STATS_SRC]) src = nla_get_u32(tb[ETHTOOL_A_STATS_SRC]); req_info->src = src; return 0; } static int stats_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); struct stats_reply_data *data = STATS_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them * from being reported to user space in case driver did not set them. */ memset(&data->stats, 0xff, sizeof(data->stats)); data->phy_stats.src = src; data->mac_stats.src = src; data->ctrl_stats.src = src; data->rmon_stats.src = src; if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && dev->ethtool_ops->get_eth_mac_stats) dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && dev->ethtool_ops->get_eth_ctrl_stats) dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) && dev->ethtool_ops->get_rmon_stats) dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats, &data->rmon_ranges); ethnl_ops_complete(dev); return 0; } static int stats_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); unsigned int n_grps = 0, n_stats = 0; int len = 0; len += nla_total_size(sizeof(u32)); /* _STATS_SRC */ if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); n_grps++; } if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); n_grps++; } if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); n_grps++; } if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) { n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64); n_grps++; /* Above includes the space for _A_STATS_GRP_HIST_VALs */ len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */ nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */ nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ ETHTOOL_RMON_HIST_MAX * 2; } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ nla_total_size(4)); /* _A_STATS_GRP_SS_ID */ len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */ nla_total_size_64bit(sizeof(u64))); return len; } static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val) { struct nlattr *nest; int ret; if (val == ETHTOOL_STAT_NOT_SET) return 0; /* We want to start stats attr types from 0, so we don't have a type * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the * actual attr we're 4B off - nla_need_padding_for_64bit() & co. * can't be used. */ #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0)) return -EMSGSIZE; #endif nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT); if (!nest) return -EMSGSIZE; ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */); if (ret) { nla_nest_cancel(skb, nest); return ret; } nla_nest_end(skb, nest); return 0; } static int stats_put_phy_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, data->phy_stats.SymbolErrorDuringCarrier)) return -EMSGSIZE; return 0; } static int stats_put_mac_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, data->mac_stats.FramesTransmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, data->mac_stats.SingleCollisionFrames) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, data->mac_stats.MultipleCollisionFrames) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, data->mac_stats.FramesReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, data->mac_stats.FrameCheckSequenceErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, data->mac_stats.AlignmentErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, data->mac_stats.OctetsTransmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, data->mac_stats.FramesWithDeferredXmissions) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, data->mac_stats.LateCollisions) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, data->mac_stats.FramesAbortedDueToXSColls) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, data->mac_stats.FramesLostDueToIntMACXmitError) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, data->mac_stats.CarrierSenseErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, data->mac_stats.OctetsReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, data->mac_stats.FramesLostDueToIntMACRcvError) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, data->mac_stats.MulticastFramesXmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, data->mac_stats.BroadcastFramesXmittedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, data->mac_stats.FramesWithExcessiveDeferral) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, data->mac_stats.MulticastFramesReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, data->mac_stats.BroadcastFramesReceivedOK) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, data->mac_stats.InRangeLengthErrors) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, data->mac_stats.OutOfRangeLengthField) || stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, data->mac_stats.FrameTooLongErrors)) return -EMSGSIZE; return 0; } static int stats_put_ctrl_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX, data->ctrl_stats.MACControlFramesTransmitted) || stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX, data->ctrl_stats.MACControlFramesReceived) || stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, data->ctrl_stats.UnsupportedOpcodesReceived)) return -EMSGSIZE; return 0; } static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist, const struct ethtool_rmon_hist_range *ranges) { struct nlattr *nest; int i; if (!ranges) return 0; for (i = 0; i < ETHTOOL_RMON_HIST_MAX; i++) { if (!ranges[i].low && !ranges[i].high) break; if (hist[i] == ETHTOOL_STAT_NOT_SET) continue; nest = nla_nest_start(skb, attr); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, ranges[i].low) || nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI, ranges[i].high) || nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL, hist[i], ETHTOOL_A_STATS_GRP_PAD)) goto err_cancel_hist; nla_nest_end(skb, nest); } return 0; err_cancel_hist: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int stats_put_rmon_stats(struct sk_buff *skb, const struct stats_reply_data *data) { if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX, data->rmon_stats.hist, data->rmon_ranges) || stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX, data->rmon_stats.hist_tx, data->rmon_ranges)) return -EMSGSIZE; if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE, data->rmon_stats.undersize_pkts) || stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE, data->rmon_stats.oversize_pkts) || stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG, data->rmon_stats.fragments) || stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER, data->rmon_stats.jabbers)) return -EMSGSIZE; return 0; } static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, int (*cb)(struct sk_buff *skb, const struct stats_reply_data *data)) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) || nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id)) goto err_cancel; if (cb(skb, data)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int stats_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct stats_req_info *req_info = STATS_REQINFO(req_base); const struct stats_reply_data *data = STATS_REPDATA(reply_base); int ret = 0; if (nla_put_u32(skb, ETHTOOL_A_STATS_SRC, req_info->src)) return -EMSGSIZE; if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, ETH_SS_STATS_ETH_PHY, stats_put_phy_stats); if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, ETH_SS_STATS_ETH_MAC, stats_put_mac_stats); if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, ETH_SS_STATS_ETH_CTRL, stats_put_ctrl_stats); if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, ETH_SS_STATS_RMON, stats_put_rmon_stats); return ret; } const struct ethnl_request_ops ethnl_stats_request_ops = { .request_cmd = ETHTOOL_MSG_STATS_GET, .reply_cmd = ETHTOOL_MSG_STATS_GET_REPLY, .hdr_attr = ETHTOOL_A_STATS_HEADER, .req_info_size = sizeof(struct stats_req_info), .reply_data_size = sizeof(struct stats_reply_data), .parse_request = stats_parse_request, .prepare_data = stats_prepare_data, .reply_size = stats_reply_size, .fill_reply = stats_fill_reply, }; static u64 ethtool_stats_sum(u64 a, u64 b) { if (a == ETHTOOL_STAT_NOT_SET) return b; if (b == ETHTOOL_STAT_NOT_SET) return a; return a + b; } /* Avoid modifying the aggregation procedure every time a new counter is added * by treating the structures as an array of u64 statistics. */ static void ethtool_aggregate_stats(void *aggr_stats, const void *emac_stats, const void *pmac_stats, size_t stats_size, size_t stats_offset) { size_t num_stats = stats_size / sizeof(u64); const u64 *s1 = emac_stats + stats_offset; const u64 *s2 = pmac_stats + stats_offset; u64 *s = aggr_stats + stats_offset; int i; for (i = 0; i < num_stats; i++) s[i] = ethtool_stats_sum(s1[i], s2[i]); } void ethtool_aggregate_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_eth_mac_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_eth_mac_stats(dev, &emac); ops->get_eth_mac_stats(dev, &pmac); ethtool_aggregate_stats(mac_stats, &emac, &pmac, sizeof(mac_stats->stats), offsetof(struct ethtool_eth_mac_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_mac_stats); void ethtool_aggregate_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_eth_phy_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_eth_phy_stats(dev, &emac); ops->get_eth_phy_stats(dev, &pmac); ethtool_aggregate_stats(phy_stats, &emac, &pmac, sizeof(phy_stats->stats), offsetof(struct ethtool_eth_phy_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_phy_stats); void ethtool_aggregate_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_eth_ctrl_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_eth_ctrl_stats(dev, &emac); ops->get_eth_ctrl_stats(dev, &pmac); ethtool_aggregate_stats(ctrl_stats, &emac, &pmac, sizeof(ctrl_stats->stats), offsetof(struct ethtool_eth_ctrl_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_ctrl_stats); void ethtool_aggregate_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_pause_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_pause_stats(dev, &emac); ops->get_pause_stats(dev, &pmac); ethtool_aggregate_stats(pause_stats, &emac, &pmac, sizeof(pause_stats->stats), offsetof(struct ethtool_pause_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_pause_stats); void ethtool_aggregate_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats) { const struct ethtool_ops *ops = dev->ethtool_ops; const struct ethtool_rmon_hist_range *dummy; struct ethtool_rmon_stats pmac, emac; memset(&emac, 0xff, sizeof(emac)); memset(&pmac, 0xff, sizeof(pmac)); emac.src = ETHTOOL_MAC_STATS_SRC_EMAC; pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC; ops->get_rmon_stats(dev, &emac, &dummy); ops->get_rmon_stats(dev, &pmac, &dummy); ethtool_aggregate_stats(rmon_stats, &emac, &pmac, sizeof(rmon_stats->stats), offsetof(struct ethtool_rmon_stats, stats)); } EXPORT_SYMBOL(ethtool_aggregate_rmon_stats);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 // SPDX-License-Identifier: GPL-2.0-or-later /* * GeneSys GL620USB-A based links * Copyright (C) 2001 by Jiun-Jie Huang <huangjj@genesyslogic.com.tw> * Copyright (C) 2001 by Stanislav Brabec <utx@penguin.cz> */ // #define DEBUG // error path messages, extra info // #define VERBOSE // more; success messages #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/gfp.h> /* * GeneSys GL620USB-A (www.genesyslogic.com.tw) * * ... should partially interop with the Win32 driver for this hardware. * The GeneSys docs imply there's some NDIS issue motivating this framing. * * Some info from GeneSys: * - GL620USB-A is full duplex; GL620USB is only half duplex for bulk. * (Some cables, like the BAFO-100c, use the half duplex version.) * - For the full duplex model, the low bit of the version code says * which side is which ("left/right"). * - For the half duplex type, a control/interrupt handshake settles * the transfer direction. (That's disabled here, partially coded.) * A control URB would block until other side writes an interrupt. * * Original code from Jiun-Jie Huang <huangjj@genesyslogic.com.tw> * and merged into "usbnet" by Stanislav Brabec <utx@penguin.cz>. */ // control msg write command #define GENELINK_CONNECT_WRITE 0xF0 // interrupt pipe index #define GENELINK_INTERRUPT_PIPE 0x03 // interrupt read buffer size #define INTERRUPT_BUFSIZE 0x08 // interrupt pipe interval value #define GENELINK_INTERRUPT_INTERVAL 0x10 // max transmit packet number per transmit #define GL_MAX_TRANSMIT_PACKETS 32 // max packet length #define GL_MAX_PACKET_LEN 1514 // max receive buffer size #define GL_RCV_BUF_SIZE \ (((GL_MAX_PACKET_LEN + 4) * GL_MAX_TRANSMIT_PACKETS) + 4) struct gl_packet { __le32 packet_length; char packet_data[]; }; struct gl_header { __le32 packet_count; struct gl_packet packets; }; static int genelink_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { struct gl_header *header; struct gl_packet *packet; struct sk_buff *gl_skb; u32 size; u32 count; /* This check is no longer done by usbnet */ if (skb->len < dev->net->hard_header_len) return 0; header = (struct gl_header *) skb->data; // get the packet count of the received skb count = le32_to_cpu(header->packet_count); if (count > GL_MAX_TRANSMIT_PACKETS) { netdev_dbg(dev->net, "genelink: invalid received packet count %u\n", count); return 0; } // set the current packet pointer to the first packet packet = &header->packets; // decrement the length for the packet count size 4 bytes skb_pull(skb, 4); while (count > 1) { // get the packet length size = le32_to_cpu(packet->packet_length); // this may be a broken packet if (size > GL_MAX_PACKET_LEN) { netdev_dbg(dev->net, "genelink: invalid rx length %d\n", size); return 0; } // allocate the skb for the individual packet gl_skb = alloc_skb(size, GFP_ATOMIC); if (gl_skb) { // copy the packet data to the new skb skb_put_data(gl_skb, packet->packet_data, size); usbnet_skb_return(dev, gl_skb); } // advance to the next packet packet = (struct gl_packet *)&packet->packet_data[size]; count--; // shift the data pointer to the next gl_packet skb_pull(skb, size + 4); } // skip the packet length field 4 bytes skb_pull(skb, 4); if (skb->len > GL_MAX_PACKET_LEN) { netdev_dbg(dev->net, "genelink: invalid rx length %d\n", skb->len); return 0; } return 1; } static struct sk_buff * genelink_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { int padlen; int length = skb->len; int headroom = skb_headroom(skb); int tailroom = skb_tailroom(skb); __le32 *packet_count; __le32 *packet_len; // FIXME: magic numbers, bleech padlen = ((skb->len + (4 + 4*1)) % 64) ? 0 : 1; if ((!skb_cloned(skb)) && ((headroom + tailroom) >= (padlen + (4 + 4*1)))) { if ((headroom < (4 + 4*1)) || (tailroom < padlen)) { skb->data = memmove(skb->head + (4 + 4*1), skb->data, skb->len); skb_set_tail_pointer(skb, skb->len); } } else { struct sk_buff *skb2; skb2 = skb_copy_expand(skb, (4 + 4*1) , padlen, flags); dev_kfree_skb_any(skb); skb = skb2; if (!skb) return NULL; } // attach the packet count to the header packet_count = skb_push(skb, (4 + 4 * 1)); packet_len = packet_count + 1; *packet_count = cpu_to_le32(1); *packet_len = cpu_to_le32(length); // add padding byte if ((skb->len % dev->maxpacket) == 0) skb_put(skb, 1); return skb; } static int genelink_bind(struct usbnet *dev, struct usb_interface *intf) { dev->hard_mtu = GL_RCV_BUF_SIZE; dev->net->hard_header_len += 4; dev->in = usb_rcvbulkpipe(dev->udev, dev->driver_info->in); dev->out = usb_sndbulkpipe(dev->udev, dev->driver_info->out); return 0; } static const struct driver_info genelink_info = { .description = "Genesys GeneLink", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_GL | FLAG_NO_SETINT, .bind = genelink_bind, .rx_fixup = genelink_rx_fixup, .tx_fixup = genelink_tx_fixup, .in = 1, .out = 2, #ifdef GENELINK_ACK .check_connect =genelink_check_connect, #endif }; static const struct usb_device_id products [] = { { USB_DEVICE(0x05e3, 0x0502), // GL620USB-A .driver_info = (unsigned long) &genelink_info, }, /* NOT: USB_DEVICE(0x05e3, 0x0501), // GL620USB * that's half duplex, not currently supported */ { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver gl620a_driver = { .name = "gl620a", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(gl620a_driver); MODULE_AUTHOR("Jiun-Jie Huang"); MODULE_DESCRIPTION("GL620-USB-A Host-to-Host Link cables"); MODULE_LICENSE("GPL");
4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 5 5 5 5 5 5 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 /* gf128mul.c - GF(2^128) multiplication functions * * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org> * * Based on Dr Brian Gladman's (GPL'd) work published at * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php * See the original copyright notice below. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ /* --------------------------------------------------------------------------- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. LICENSE TERMS The free distribution and use of this software in both source and binary form is allowed (with or without changes) provided that: 1. distributions of this source code include the above copyright notice, this list of conditions and the following disclaimer; 2. distributions in binary form include the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other associated materials; 3. the copyright holder's name is not used to endorse products built using this software without specific written permission. ALTERNATIVELY, provided that this notice is retained in full, this product may be distributed under the terms of the GNU General Public License (GPL), in which case the provisions of the GPL apply INSTEAD OF those given above. DISCLAIMER This software is provided 'as is' with no explicit or implied warranties in respect of its properties, including, but not limited to, correctness and/or fitness for purpose. --------------------------------------------------------------------------- Issue 31/01/2006 This file provides fast multiplication in GF(2^128) as required by several cryptographic authentication modes */ #include <crypto/gf128mul.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #define gf128mul_dat(q) { \ q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\ q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ } /* * Given a value i in 0..255 as the byte overflow when a field element * in GF(2^128) is multiplied by x^8, the following macro returns the * 16-bit value that must be XOR-ed into the low-degree end of the * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1. * * There are two versions of the macro, and hence two tables: one for * the "be" convention where the highest-order bit is the coefficient of * the highest-degree polynomial term, and one for the "le" convention * where the highest-order bit is the coefficient of the lowest-degree * polynomial term. In both cases the values are stored in CPU byte * endianness such that the coefficients are ordered consistently across * bytes, i.e. in the "be" table bits 15..0 of the stored value * correspond to the coefficients of x^15..x^0, and in the "le" table * bits 15..0 correspond to the coefficients of x^0..x^15. * * Therefore, provided that the appropriate byte endianness conversions * are done by the multiplication functions (and these must be in place * anyway to support both little endian and big endian CPUs), the "be" * table can be used for multiplications of both "bbe" and "ble" * elements, and the "le" table can be used for multiplications of both * "lle" and "lbe" elements. */ #define xda_be(i) ( \ (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) #define xda_le(i) ( \ (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \ ) static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); /* * The following functions multiply a field element by x^8 in * the polynomial field representation. They use 64-bit word operations * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. */ static void gf128mul_x8_lle(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = gf128mul_table_le[b & 0xff]; x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } /* time invariant version of gf128mul_x8_lle */ static void gf128mul_x8_lle_ti(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = xda_le(b & 0xff); /* avoid table lookup */ x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } static void gf128mul_x8_bbe(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = gf128mul_table_be[a >> 56]; x->a = cpu_to_be64((a << 8) | (b >> 56)); x->b = cpu_to_be64((b << 8) ^ _tt); } void gf128mul_x8_ble(le128 *r, const le128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); u64 _tt = gf128mul_table_be[a >> 56]; r->a = cpu_to_le64((a << 8) | (b >> 56)); r->b = cpu_to_le64((b << 8) ^ _tt); } EXPORT_SYMBOL(gf128mul_x8_ble); void gf128mul_lle(be128 *r, const be128 *b) { /* * The p array should be aligned to twice the size of its element type, * so that every even/odd pair is guaranteed to share a cacheline * (assuming a cacheline size of 32 bytes or more, which is by far the * most common). This ensures that each be128_xor() call in the loop * takes the same amount of time regardless of the value of 'ch', which * is derived from function parameter 'b', which is commonly used as a * key, e.g., for GHASH. The odd array elements are all set to zero, * making each be128_xor() a NOP if its associated bit in 'ch' is not * set, and this is equivalent to calling be128_xor() conditionally. * This approach aims to avoid leaking information about such keys * through execution time variances. * * Unfortunately, __aligned(16) or higher does not work on x86 for * variables on the stack so we need to perform the alignment by hand. */ be128 array[16 + 3] = {}; be128 *p = PTR_ALIGN(&array[0], 2 * sizeof(be128)); int i; p[0] = *r; for (i = 0; i < 7; ++i) gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]); memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[15 - i]; be128_xor(r, r, &p[ 0 + !(ch & 0x80)]); be128_xor(r, r, &p[ 2 + !(ch & 0x40)]); be128_xor(r, r, &p[ 4 + !(ch & 0x20)]); be128_xor(r, r, &p[ 6 + !(ch & 0x10)]); be128_xor(r, r, &p[ 8 + !(ch & 0x08)]); be128_xor(r, r, &p[10 + !(ch & 0x04)]); be128_xor(r, r, &p[12 + !(ch & 0x02)]); be128_xor(r, r, &p[14 + !(ch & 0x01)]); if (++i >= 16) break; gf128mul_x8_lle_ti(r); /* use the time invariant version */ } } EXPORT_SYMBOL(gf128mul_lle); void gf128mul_bbe(be128 *r, const be128 *b) { be128 p[8]; int i; p[0] = *r; for (i = 0; i < 7; ++i) gf128mul_x_bbe(&p[i + 1], &p[i]); memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[i]; if (ch & 0x80) be128_xor(r, r, &p[7]); if (ch & 0x40) be128_xor(r, r, &p[6]); if (ch & 0x20) be128_xor(r, r, &p[5]); if (ch & 0x10) be128_xor(r, r, &p[4]); if (ch & 0x08) be128_xor(r, r, &p[3]); if (ch & 0x04) be128_xor(r, r, &p[2]); if (ch & 0x02) be128_xor(r, r, &p[1]); if (ch & 0x01) be128_xor(r, r, &p[0]); if (++i >= 16) break; gf128mul_x8_bbe(r); } } EXPORT_SYMBOL(gf128mul_bbe); /* This version uses 64k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key value in GF(2^128). If we consider a GF(2^128) value in the buffer's lowest byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. But we also need tables for each of the 16 higher bytes in the buffer as well, which makes 64 kbytes in total. */ /* additional explanation * t[0][BYTE] contains g*BYTE * t[1][BYTE] contains g*x^8*BYTE * .. * t[15][BYTE] contains g*x^120*BYTE */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g) { struct gf128mul_64k *t; int i, j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; for (i = 0; i < 16; i++) { t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL); if (!t->t[i]) { gf128mul_free_64k(t); t = NULL; goto out; } } t->t[0]->t[1] = *g; for (j = 1; j <= 64; j <<= 1) gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]); for (i = 0;;) { for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[i]->t[j + k], &t->t[i]->t[j], &t->t[i]->t[k]); if (++i >= 16) break; for (j = 128; j > 0; j >>= 1) { t->t[i]->t[j] = t->t[i - 1]->t[j]; gf128mul_x8_bbe(&t->t[i]->t[j]); } } out: return t; } EXPORT_SYMBOL(gf128mul_init_64k_bbe); void gf128mul_free_64k(struct gf128mul_64k *t) { int i; for (i = 0; i < 16; i++) kfree_sensitive(t->t[i]); kfree_sensitive(t); } EXPORT_SYMBOL(gf128mul_free_64k); void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i; *r = t->t[0]->t[ap[15]]; for (i = 1; i < 16; ++i) be128_xor(r, r, &t->t[i]->t[ap[15 - i]]); *a = *r; } EXPORT_SYMBOL(gf128mul_64k_bbe); /* This version uses 4k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key value in GF(2^128). If we consider a GF(2^128) value in a single byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. If we take the highest byte in the buffer and use this table to get the result, we then have to multiply by x^120 to get the final value. For the next highest byte the result has to be multiplied by x^112 and so on. But we can do this by accumulating the result in an accumulator starting with the result for the top byte. We repeatedly multiply the accumulator value by x^8 and then add in (i.e. xor) the 16 bytes of the next lower byte in the buffer, stopping when we reach the lowest byte. This requires a 4096 byte table. */ struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g) { struct gf128mul_4k *t; int j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; t->t[128] = *g; for (j = 64; j > 0; j >>= 1) gf128mul_x_lle(&t->t[j], &t->t[j+j]); for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); out: return t; } EXPORT_SYMBOL(gf128mul_init_4k_lle); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g) { struct gf128mul_4k *t; int j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; t->t[1] = *g; for (j = 1; j <= 64; j <<= 1) gf128mul_x_bbe(&t->t[j + j], &t->t[j]); for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); out: return t; } EXPORT_SYMBOL(gf128mul_init_4k_bbe); void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i = 15; *r = t->t[ap[15]]; while (i--) { gf128mul_x8_lle(r); be128_xor(r, r, &t->t[ap[i]]); } *a = *r; } EXPORT_SYMBOL(gf128mul_4k_lle); void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i = 0; *r = t->t[ap[0]]; while (++i < 16) { gf128mul_x8_bbe(r); be128_xor(r, r, &t->t[ap[i]]); } *a = *r; } EXPORT_SYMBOL(gf128mul_4k_bbe); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
10 11 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H #include "internal.h" #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> /* Attribute not found */ #define ENOATTR ENODATA #ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; extern const struct xattr_handler erofs_xattr_security_handler; static inline const char *erofs_xattr_prefix(unsigned int idx, struct dentry *dentry) { const struct xattr_handler *handler = NULL; static const struct xattr_handler * const xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, #endif }; if (idx && idx < ARRAY_SIZE(xattr_handler_map)) handler = xattr_handler_map[idx]; if (!xattr_handler_can_list(handler, dentry)) return NULL; return xattr_prefix(handler); } extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); void erofs_xattr_prefixes_cleanup(struct super_block *sb); int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t erofs_listxattr(struct dentry *, char *, size_t); #else static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} static inline int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { return -EOPNOTSUPP; } #define erofs_listxattr (NULL) #define erofs_xattr_handlers (NULL) #endif /* !CONFIG_EROFS_FS_XATTR */ #ifdef CONFIG_EROFS_FS_POSIX_ACL struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #else #define erofs_get_acl (NULL) #endif #endif
465 467 453 454 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 // SPDX-License-Identifier: GPL-2.0-only /* * Common framework for low-level network console, dump, and debugger code * * Sep 8 2003 Matt Mackall <mpm@selenic.com> * * based on the netconsole code from: * * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> * Copyright (C) 2002 Red Hat, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/if_vlan.h> #include <net/tcp.h> #include <net/udp.h> #include <net/addrconf.h> #include <net/ndisc.h> #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <trace/events/napi.h> #include <linux/kconfig.h> /* * We maintain a small pool of fully-sized skbs, to make sure the * message gets out even in extreme OOM situations. */ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 static struct sk_buff_head skb_pool; DEFINE_STATIC_SRCU(netpoll_srcu); #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ (sizeof(struct ethhdr) + \ sizeof(struct iphdr) + \ sizeof(struct udphdr) + \ MAX_UDP_CHUNK) static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); #define np_info(np, fmt, ...) \ pr_info("%s: " fmt, np->name, ##__VA_ARGS__) #define np_err(np, fmt, ...) \ pr_err("%s: " fmt, np->name, ##__VA_ARGS__) #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { /* This is actually a packet drop, but we * don't want the code that calls this * function to try and operate on a NULL skb. */ goto out; } } status = netdev_start_xmit(skb, dev, txq, false); out: return status; } static void queue_process(struct work_struct *work) { struct netpoll_info *npinfo = container_of(work, struct netpoll_info, tx_work.work); struct sk_buff *skb; unsigned long flags; while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; struct netdev_queue *txq; unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { kfree_skb(skb); continue; } local_irq_save(flags); /* check if skb->queue_mapping is still valid */ q_index = skb_get_queue_mapping(skb); if (unlikely(q_index >= dev->real_num_tx_queues)) { q_index = q_index % dev->real_num_tx_queues; skb_set_queue_mapping(skb, q_index); } txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { skb_queue_head(&npinfo->txq, skb); HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); schedule_delayed_work(&npinfo->tx_work, HZ/10); return; } HARD_TX_UNLOCK(dev, txq); local_irq_restore(flags); } } static int netif_local_xmit_active(struct net_device *dev) { int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) return 1; } return 0; } static void poll_one_napi(struct napi_struct *napi) { int work; /* If we set this bit but see that it has already been set, * that indicates that napi has been disabled and we need * to abort this operation */ if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; /* We explicilty pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only. */ work = napi->poll(napi, 0); WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); trace_napi_poll(napi, work, 0); clear_bit(NAPI_STATE_NPSVC, &napi->state); } static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int cpu = smp_processor_id(); list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { poll_one_napi(napi); smp_store_release(&napi->poll_owner, -1); } } } void netpoll_poll_dev(struct net_device *dev) { struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ if (!ni || down_trylock(&ni->dev_lock)) return; /* Some drivers will take the same locks in poll and xmit, * we can't poll if local CPU is already in xmit. */ if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } ops = dev->netdev_ops; if (ops->ndo_poll_controller) ops->ndo_poll_controller(dev); poll_napi(dev); up(&ni->dev_lock); zap_completion_queue(); } EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; int idx; might_sleep(); idx = srcu_read_lock(&netpoll_srcu); ni = srcu_dereference(dev->npinfo, &netpoll_srcu); if (ni) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); } void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; rcu_read_lock(); ni = rcu_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); rcu_read_unlock(); } static void refill_skbs(void) { struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&skb_pool.lock, flags); while (skb_pool.qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; __skb_queue_tail(&skb_pool, skb); } spin_unlock_irqrestore(&skb_pool.lock, flags); } static void zap_completion_queue(void) { unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); while (clist != NULL) { struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); } } } put_cpu_var(softnet_data); } static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) { int count = 0; struct sk_buff *skb; zap_completion_queue(); refill_skbs(); repeat: skb = alloc_skb(len, GFP_ATOMIC); if (!skb) skb = skb_dequeue(&skb_pool); if (!skb) { if (++count < 10) { netpoll_poll_dev(np->dev); goto repeat; } return NULL; } refcount_set(&skb->users, 1); skb_reserve(skb, reserve); return skb; } static int netpoll_owner_active(struct net_device *dev) { struct napi_struct *napi; list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { if (READ_ONCE(napi->poll_owner) == smp_processor_id()) return 1; } return 0; } /* call with IRQ disabled */ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); dev = np->dev; npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; txq = netdev_core_pick_tx(dev, skb, NULL); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (HARD_TX_TRYLOCK(dev, txq)) { if (!netif_xmit_stopped(txq)) status = netpoll_start_xmit(skb, dev, txq); HARD_TX_UNLOCK(dev, txq); if (dev_xmit_complete(status)) break; } /* tickle device maybe there is some cleanup */ netpoll_poll_dev(np->dev); udelay(USEC_PER_POLL); } WARN_ONCE(!irqs_disabled(), "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", dev->name, dev->netdev_ops->ndo_start_xmit); } if (!dev_xmit_complete(status)) { skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } return NETDEV_TX_OK; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; netdev_tx_t ret; if (unlikely(!np)) { dev_kfree_skb_irq(skb); ret = NET_XMIT_DROP; } else { local_irq_save(flags); ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); } return ret; } EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; static atomic_t ip_ident; struct ipv6hdr *ip6h; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!irqs_disabled()); udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); else ip_len = udp_len + sizeof(*iph); total_len = ip_len + LL_RESERVED_SPACE(np->dev); skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) return; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->source = htons(np->local_port); udph->dest = htons(np->remote_port); udph->len = htons(udp_len); if (np->ipv6) { udph->check = 0; udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); /* ip6h->version = 6; ip6h->priority = 0; */ *(unsigned char *)ip6h = 0x60; ip6h->flow_lbl[0] = 0; ip6h->flow_lbl[1] = 0; ip6h->flow_lbl[2] = 0; ip6h->payload_len = htons(sizeof(struct udphdr) + len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 32; ip6h->saddr = np->local_ip.in6; ip6h->daddr = np->remote_ip.in6; eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, csum_partial(udph, udp_len, 0)); if (udph->check == 0) udph->check = CSUM_MANGLED_0; skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); iph = ip_hdr(skb); /* iph->version = 4; iph->ihl = 5; */ *(unsigned char *)iph = 0x45; iph->tos = 0; put_unaligned(htons(ip_len), &(iph->tot_len)); iph->id = htons(atomic_inc_return(&ip_ident)); iph->frag_off = 0; iph->ttl = 64; iph->protocol = IPPROTO_UDP; iph->check = 0; put_unaligned(np->local_ip.ip, &(iph->saddr)); put_unaligned(np->remote_ip.ip, &(iph->daddr)); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); } ether_addr_copy(eth->h_source, np->dev->dev_addr); ether_addr_copy(eth->h_dest, np->remote_mac); skb->dev = np->dev; netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); void netpoll_print_options(struct netpoll *np) { np_info(np, "local port %d\n", np->local_port); if (np->ipv6) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); np_info(np, "interface '%s'\n", np->dev_name); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); else np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); np_info(np, "remote ethernet address %pM\n", np->remote_mac); } EXPORT_SYMBOL(netpoll_print_options); static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) { const char *end; if (!strchr(str, ':') && in4_pton(str, -1, (void *)addr, -1, &end) > 0) { if (!*end) return 0; } if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { #if IS_ENABLED(CONFIG_IPV6) if (!*end) return 1; #else return -1; #endif } return -1; } int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; int ipv6; bool ipversion_set = false; if (*cur != '@') { if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (kstrtou16(cur, 10, &np->local_port)) goto parse_failed; cur = delim; } cur++; if (*cur != '/') { ipversion_set = true; if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); if (ipv6 < 0) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim; } cur++; if (*cur != ',') { /* parse out dev name */ if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; strscpy(np->dev_name, cur, sizeof(np->dev_name)); cur = delim; } cur++; if (*cur != '@') { /* dst port */ if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; if (*cur == ' ' || *cur == '\t') np_info(np, "warning: whitespace is not allowed\n"); if (kstrtou16(cur, 10, &np->remote_port)) goto parse_failed; cur = delim; } cur++; /* dst ip */ if ((delim = strchr(cur, '/')) == NULL) goto parse_failed; *delim = 0; ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); if (ipv6 < 0) goto parse_failed; else if (ipversion_set && np->ipv6 != (bool)ipv6) goto parse_failed; else np->ipv6 = (bool)ipv6; cur = delim + 1; if (*cur != 0) { /* MAC address */ if (!mac_pton(cur, np->remote_mac)) goto parse_failed; } netpoll_print_options(np); return 0; parse_failed: np_info(np, "couldn't parse config at '%s'!\n", cur); return -1; } EXPORT_SYMBOL(netpoll_parse_options); int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; const struct net_device_ops *ops; int err; np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", np->dev_name); err = -ENOTSUPP; goto out; } if (!ndev->npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } sema_init(&npinfo->dev_lock, 1); skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); refcount_set(&npinfo->refcnt, 1); ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { err = ops->ndo_netpoll_setup(ndev, npinfo); if (err) goto free_npinfo; } } else { npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } npinfo->netpoll = np; /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); return 0; free_npinfo: kfree(npinfo); out: return err; } EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { struct net_device *ndev = NULL; struct in_device *in_dev; int err; rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } if (!ndev) { np_err(np, "%s doesn't exist, aborting\n", np->dev_name); err = -ENODEV; goto unlock; } netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", np->dev_name); err = -EBUSY; goto put; } if (!netif_running(ndev)) { unsigned long atmost; np_info(np, "device %s not up yet, forcing it\n", np->dev_name); err = dev_open(ndev, NULL); if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); atmost = jiffies + carrier_timeout * HZ; while (!netif_carrier_ok(ndev)) { if (time_after(jiffies, atmost)) { np_notice(np, "timeout waiting for carrier\n"); break; } msleep(1); } rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { const struct in_ifaddr *ifa; in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) goto put_noaddr; ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } np->local_ip.ip = ifa->ifa_local; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) struct inet6_dev *idev; err = -EDESTADDRREQ; idev = __in6_dev_get(ndev); if (idev) { struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); list_for_each_entry(ifp, &idev->addr_list, if_list) { if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; err = 0; break; } read_unlock_bh(&idev->lock); } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", np->dev_name); goto put; } else np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); #else np_err(np, "IPv6 is not supported %s, aborting\n", np->dev_name); err = -EINVAL; goto put; #endif } } /* fill up the skb queue */ refill_skbs(); err = __netpoll_setup(np, ndev); if (err) goto put; rtnl_unlock(); return 0; put: netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(netpoll_setup); static int __init netpoll_init(void) { skb_queue_head_init(&skb_pool); return 0; } core_initcall(netpoll_init); static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = container_of(rcu_head, struct netpoll_info, rcu); skb_queue_purge(&npinfo->txq); /* we can't call cancel_delayed_work_sync here, as we are in softirq */ cancel_delayed_work(&npinfo->tx_work); /* clean after last, unfinished work */ __skb_queue_purge(&npinfo->txq); /* now cancel it again */ cancel_delayed_work(&npinfo->tx_work); kfree(npinfo); } void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; npinfo = rtnl_dereference(np->dev->npinfo); if (!npinfo) return; synchronize_srcu(&netpoll_srcu); if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; ops = np->dev->netdev_ops; if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ synchronize_rcu(); __netpoll_cleanup(np); kfree(np); } EXPORT_SYMBOL_GPL(__netpoll_free); void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; __netpoll_cleanup(np); netdev_put(np->dev, &np->dev_tracker); np->dev = NULL; out: rtnl_unlock(); } EXPORT_SYMBOL(netpoll_cleanup);
605 110 618 618 618 111 111 111 110 101 101 111 618 110 110 110 110 605 605 605 618 618 617 618 618 616 603 603 603 603 605 603 606 605 605 603 603 602 603 2 19 17 17 111 110 111 110 110 110 110 9 101 100 101 925 4177 11174 11121 4177 11134 104 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 // SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> struct wb_stats { unsigned long nr_dirty; unsigned long nr_io; unsigned long nr_more_io; unsigned long nr_dirty_time; unsigned long nr_writeback; unsigned long nr_reclaimable; unsigned long nr_dirtied; unsigned long nr_written; unsigned long dirty_thresh; unsigned long wb_thresh; }; static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static void collect_wb_stats(struct wb_stats *stats, struct bdi_writeback *wb) { struct inode *inode; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) stats->nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) stats->nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) stats->nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) stats->nr_dirty_time++; spin_unlock(&wb->list_lock); stats->nr_writeback += wb_stat(wb, WB_WRITEBACK); stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE); stats->nr_dirtied += wb_stat(wb, WB_DIRTIED); stats->nr_written += wb_stat(wb, WB_WRITTEN); stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh); } #ifdef CONFIG_CGROUP_WRITEBACK static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { struct bdi_writeback *wb; rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { if (!wb_tryget(wb)) continue; collect_wb_stats(stats, wb); wb_put(wb); } rcu_read_unlock(); } #else static void bdi_collect_stats(struct backing_dev_info *bdi, struct wb_stats *stats) { collect_wb_stats(stats, &bdi->wb); } #endif static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct wb_stats stats; unsigned long tot_bw; global_dirty_limits(&background_thresh, &dirty_thresh); memset(&stats, 0, sizeof(stats)); stats.dirty_thresh = dirty_thresh; bdi_collect_stats(bdi, &stats); tot_bw = atomic_long_read(&bdi->tot_write_bandwidth); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", K(stats.nr_writeback), K(stats.nr_reclaimable), K(stats.wb_thresh), K(dirty_thresh), K(background_thresh), K(stats.nr_dirtied), K(stats.nr_written), K(tot_bw), stats.nr_dirty, stats.nr_io, stats.nr_more_io, stats.nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb, struct wb_stats *stats) { seq_printf(m, "WbCgIno: %10lu\n" "WbWriteback: %10lu kB\n" "WbReclaimable: %10lu kB\n" "WbDirtyThresh: %10lu kB\n" "WbDirtied: %10lu kB\n" "WbWritten: %10lu kB\n" "WbWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "state: %10lx\n\n", #ifdef CONFIG_CGROUP_WRITEBACK cgroup_ino(wb->memcg_css->cgroup), #else 1ul, #endif K(stats->nr_writeback), K(stats->nr_reclaimable), K(stats->wb_thresh), K(stats->nr_dirtied), K(stats->nr_written), K(wb->avg_write_bandwidth), stats->nr_dirty, stats->nr_io, stats->nr_more_io, stats->nr_dirty_time, wb->state); } static int cgwb_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; unsigned long background_thresh; unsigned long dirty_thresh; struct bdi_writeback *wb; global_dirty_limits(&background_thresh, &dirty_thresh); rcu_read_lock(); list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) { struct wb_stats stats = { .dirty_thresh = dirty_thresh }; if (!wb_tryget(wb)) continue; collect_wb_stats(&stats, wb); /* * Calculate thresh of wb in writeback cgroup which is min of * thresh in global domain and thresh in cgroup domain. Drop * rcu lock because cgwb_calc_thresh may sleep in * cgroup_rstat_flush. We can do so here because we have a ref. */ if (mem_cgroup_wb_domain(wb)) { rcu_read_unlock(); stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb)); rcu_read_lock(); } wb_stats_show(m, wb, &stats); wb_put(wb); } rcu_read_unlock(); return 0; } DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi, &cgwb_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else /* CONFIG_DEBUG_FS */ static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif /* CONFIG_DEBUG_FS */ static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS); if (err) fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { WARN_ON(delayed_work_pending(&wb->dwork)); percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). */ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. */ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); bdi->last_bdp_sleep = jiffies; return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. */ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { del_timer_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name);
83 196 198 197 198 178 83 82 83 193 197 124 124 124 199 197 82 83 83 83 83 121 81 83 83 82 82 82 83 81 193 173 199 96 16 83 193 194 199 82 198 198 198 196 196 171 171 181 172 198 198 196 50 45 45 45 45 18 45 50 38 12 145 7 145 210 185 21 191 185 206 7 198 211 119 118 14 105 94 95 2 93 3 3 3 3 50 10 53 3 50 13 3 10 30 102 213 140 212 23 23 190 102 192 191 212 213 7 7 5 133 73 73 35 6 8 8 8 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 // SPDX-License-Identifier: GPL-2.0-or-later /* * Symmetric key cipher operations. * * Generic encrypt/decrypt wrapper for ciphers, handles operations across * multiple page boundaries by using temporary blocks. In user context, * the kernel is given a chance to schedule us once per page. * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/bug.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "skcipher.h" #define CRYPTO_ALG_TYPE_SKCIPHER_MASK 0x0000000e enum { SKCIPHER_WALK_PHYS = 1 << 0, SKCIPHER_WALK_SLOW = 1 << 1, SKCIPHER_WALK_COPY = 1 << 2, SKCIPHER_WALK_DIFF = 1 << 3, SKCIPHER_WALK_SLEEP = 1 << 4, }; struct skcipher_walk_buffer { struct list_head entry; struct scatter_walk dst; unsigned int len; u8 *data; u8 buffer[]; }; static const struct crypto_type crypto_skcipher_type; static int skcipher_walk_next(struct skcipher_walk *walk); static inline void skcipher_map_src(struct skcipher_walk *walk) { walk->src.virt.addr = scatterwalk_map(&walk->in); } static inline void skcipher_map_dst(struct skcipher_walk *walk) { walk->dst.virt.addr = scatterwalk_map(&walk->out); } static inline void skcipher_unmap_src(struct skcipher_walk *walk) { scatterwalk_unmap(walk->src.virt.addr); } static inline void skcipher_unmap_dst(struct skcipher_walk *walk) { scatterwalk_unmap(walk->dst.virt.addr); } static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk) { return walk->flags & SKCIPHER_WALK_SLEEP ? GFP_KERNEL : GFP_ATOMIC; } /* Get a spot of the specified length that does not straddle a page. * The caller needs to ensure that there is enough space for this operation. */ static inline u8 *skcipher_get_spot(u8 *start, unsigned int len) { u8 *end_page = (u8 *)(((unsigned long)(start + len - 1)) & PAGE_MASK); return max(start, end_page); } static inline struct skcipher_alg *__crypto_skcipher_alg( struct crypto_alg *alg) { return container_of(alg, struct skcipher_alg, base); } static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize) { u8 *addr; addr = (u8 *)ALIGN((unsigned long)walk->buffer, walk->alignmask + 1); addr = skcipher_get_spot(addr, bsize); scatterwalk_copychunks(addr, &walk->out, bsize, (walk->flags & SKCIPHER_WALK_PHYS) ? 2 : 1); return 0; } int skcipher_walk_done(struct skcipher_walk *walk, int err) { unsigned int n = walk->nbytes; unsigned int nbytes = 0; if (!n) goto finish; if (likely(err >= 0)) { n -= err; nbytes = walk->total - n; } if (likely(!(walk->flags & (SKCIPHER_WALK_PHYS | SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF)))) { unmap_src: skcipher_unmap_src(walk); } else if (walk->flags & SKCIPHER_WALK_DIFF) { skcipher_unmap_dst(walk); goto unmap_src; } else if (walk->flags & SKCIPHER_WALK_COPY) { skcipher_map_dst(walk); memcpy(walk->dst.virt.addr, walk->page, n); skcipher_unmap_dst(walk); } else if (unlikely(walk->flags & SKCIPHER_WALK_SLOW)) { if (err > 0) { /* * Didn't process all bytes. Either the algorithm is * broken, or this was the last step and it turned out * the message wasn't evenly divisible into blocks but * the algorithm requires it. */ err = -EINVAL; nbytes = 0; } else n = skcipher_done_slow(walk, n); } if (err > 0) err = 0; walk->total = nbytes; walk->nbytes = 0; scatterwalk_advance(&walk->in, n); scatterwalk_advance(&walk->out, n); scatterwalk_done(&walk->in, 0, nbytes); scatterwalk_done(&walk->out, 1, nbytes); if (nbytes) { crypto_yield(walk->flags & SKCIPHER_WALK_SLEEP ? CRYPTO_TFM_REQ_MAY_SLEEP : 0); return skcipher_walk_next(walk); } finish: /* Short-circuit for the common/fast path. */ if (!((unsigned long)walk->buffer | (unsigned long)walk->page)) goto out; if (walk->flags & SKCIPHER_WALK_PHYS) goto out; if (walk->iv != walk->oiv) memcpy(walk->oiv, walk->iv, walk->ivsize); if (walk->buffer != walk->page) kfree(walk->buffer); if (walk->page) free_page((unsigned long)walk->page); out: return err; } EXPORT_SYMBOL_GPL(skcipher_walk_done); void skcipher_walk_complete(struct skcipher_walk *walk, int err) { struct skcipher_walk_buffer *p, *tmp; list_for_each_entry_safe(p, tmp, &walk->buffers, entry) { u8 *data; if (err) goto done; data = p->data; if (!data) { data = PTR_ALIGN(&p->buffer[0], walk->alignmask + 1); data = skcipher_get_spot(data, walk->stride); } scatterwalk_copychunks(data, &p->dst, p->len, 1); if (offset_in_page(p->data) + p->len + walk->stride > PAGE_SIZE) free_page((unsigned long)p->data); done: list_del(&p->entry); kfree(p); } if (!err && walk->iv != walk->oiv) memcpy(walk->oiv, walk->iv, walk->ivsize); if (walk->buffer != walk->page) kfree(walk->buffer); if (walk->page) free_page((unsigned long)walk->page); } EXPORT_SYMBOL_GPL(skcipher_walk_complete); static void skcipher_queue_write(struct skcipher_walk *walk, struct skcipher_walk_buffer *p) { p->dst = walk->out; list_add_tail(&p->entry, &walk->buffers); } static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize) { bool phys = walk->flags & SKCIPHER_WALK_PHYS; unsigned alignmask = walk->alignmask; struct skcipher_walk_buffer *p; unsigned a; unsigned n; u8 *buffer; void *v; if (!phys) { if (!walk->buffer) walk->buffer = walk->page; buffer = walk->buffer; if (buffer) goto ok; } /* Start with the minimum alignment of kmalloc. */ a = crypto_tfm_ctx_alignment() - 1; n = bsize; if (phys) { /* Calculate the minimum alignment of p->buffer. */ a &= (sizeof(*p) ^ (sizeof(*p) - 1)) >> 1; n += sizeof(*p); } /* Minimum size to align p->buffer by alignmask. */ n += alignmask & ~a; /* Minimum size to ensure p->buffer does not straddle a page. */ n += (bsize - 1) & ~(alignmask | a); v = kzalloc(n, skcipher_walk_gfp(walk)); if (!v) return skcipher_walk_done(walk, -ENOMEM); if (phys) { p = v; p->len = bsize; skcipher_queue_write(walk, p); buffer = p->buffer; } else { walk->buffer = v; buffer = v; } ok: walk->dst.virt.addr = PTR_ALIGN(buffer, alignmask + 1); walk->dst.virt.addr = skcipher_get_spot(walk->dst.virt.addr, bsize); walk->src.virt.addr = walk->dst.virt.addr; scatterwalk_copychunks(walk->src.virt.addr, &walk->in, bsize, 0); walk->nbytes = bsize; walk->flags |= SKCIPHER_WALK_SLOW; return 0; } static int skcipher_next_copy(struct skcipher_walk *walk) { struct skcipher_walk_buffer *p; u8 *tmp = walk->page; skcipher_map_src(walk); memcpy(tmp, walk->src.virt.addr, walk->nbytes); skcipher_unmap_src(walk); walk->src.virt.addr = tmp; walk->dst.virt.addr = tmp; if (!(walk->flags & SKCIPHER_WALK_PHYS)) return 0; p = kmalloc(sizeof(*p), skcipher_walk_gfp(walk)); if (!p) return -ENOMEM; p->data = walk->page; p->len = walk->nbytes; skcipher_queue_write(walk, p); if (offset_in_page(walk->page) + walk->nbytes + walk->stride > PAGE_SIZE) walk->page = NULL; else walk->page += walk->nbytes; return 0; } static int skcipher_next_fast(struct skcipher_walk *walk) { unsigned long diff; walk->src.phys.page = scatterwalk_page(&walk->in); walk->src.phys.offset = offset_in_page(walk->in.offset); walk->dst.phys.page = scatterwalk_page(&walk->out); walk->dst.phys.offset = offset_in_page(walk->out.offset); if (walk->flags & SKCIPHER_WALK_PHYS) return 0; diff = walk->src.phys.offset - walk->dst.phys.offset; diff |= walk->src.virt.page - walk->dst.virt.page; skcipher_map_src(walk); walk->dst.virt.addr = walk->src.virt.addr; if (diff) { walk->flags |= SKCIPHER_WALK_DIFF; skcipher_map_dst(walk); } return 0; } static int skcipher_walk_next(struct skcipher_walk *walk) { unsigned int bsize; unsigned int n; int err; walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF); n = walk->total; bsize = min(walk->stride, max(n, walk->blocksize)); n = scatterwalk_clamp(&walk->in, n); n = scatterwalk_clamp(&walk->out, n); if (unlikely(n < bsize)) { if (unlikely(walk->total < walk->blocksize)) return skcipher_walk_done(walk, -EINVAL); slow_path: err = skcipher_next_slow(walk, bsize); goto set_phys_lowmem; } if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) { if (!walk->page) { gfp_t gfp = skcipher_walk_gfp(walk); walk->page = (void *)__get_free_page(gfp); if (!walk->page) goto slow_path; } walk->nbytes = min_t(unsigned, n, PAGE_SIZE - offset_in_page(walk->page)); walk->flags |= SKCIPHER_WALK_COPY; err = skcipher_next_copy(walk); goto set_phys_lowmem; } walk->nbytes = n; return skcipher_next_fast(walk); set_phys_lowmem: if (!err && (walk->flags & SKCIPHER_WALK_PHYS)) { walk->src.phys.page = virt_to_page(walk->src.virt.addr); walk->dst.phys.page = virt_to_page(walk->dst.virt.addr); walk->src.phys.offset &= PAGE_SIZE - 1; walk->dst.phys.offset &= PAGE_SIZE - 1; } return err; } static int skcipher_copy_iv(struct skcipher_walk *walk) { unsigned a = crypto_tfm_ctx_alignment() - 1; unsigned alignmask = walk->alignmask; unsigned ivsize = walk->ivsize; unsigned bs = walk->stride; unsigned aligned_bs; unsigned size; u8 *iv; aligned_bs = ALIGN(bs, alignmask + 1); /* Minimum size to align buffer by alignmask. */ size = alignmask & ~a; if (walk->flags & SKCIPHER_WALK_PHYS) size += ivsize; else { size += aligned_bs + ivsize; /* Minimum size to ensure buffer does not straddle a page. */ size += (bs - 1) & ~(alignmask | a); } walk->buffer = kmalloc(size, skcipher_walk_gfp(walk)); if (!walk->buffer) return -ENOMEM; iv = PTR_ALIGN(walk->buffer, alignmask + 1); iv = skcipher_get_spot(iv, bs) + aligned_bs; walk->iv = memcpy(iv, walk->iv, walk->ivsize); return 0; } static int skcipher_walk_first(struct skcipher_walk *walk) { if (WARN_ON_ONCE(in_hardirq())) return -EDEADLK; walk->buffer = NULL; if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { int err = skcipher_copy_iv(walk); if (err) return err; } walk->page = NULL; return skcipher_walk_next(walk); } static int skcipher_walk_skcipher(struct skcipher_walk *walk, struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); walk->total = req->cryptlen; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); walk->flags &= ~SKCIPHER_WALK_SLEEP; walk->flags |= req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? SKCIPHER_WALK_SLEEP : 0; walk->blocksize = crypto_skcipher_blocksize(tfm); walk->ivsize = crypto_skcipher_ivsize(tfm); walk->alignmask = crypto_skcipher_alignmask(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) walk->stride = alg->co.chunksize; else walk->stride = alg->walksize; return skcipher_walk_first(walk); } int skcipher_walk_virt(struct skcipher_walk *walk, struct skcipher_request *req, bool atomic) { int err; might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP); walk->flags &= ~SKCIPHER_WALK_PHYS; err = skcipher_walk_skcipher(walk, req); walk->flags &= atomic ? ~SKCIPHER_WALK_SLEEP : ~0; return err; } EXPORT_SYMBOL_GPL(skcipher_walk_virt); int skcipher_walk_async(struct skcipher_walk *walk, struct skcipher_request *req) { walk->flags |= SKCIPHER_WALK_PHYS; INIT_LIST_HEAD(&walk->buffers); return skcipher_walk_skcipher(walk, req); } EXPORT_SYMBOL_GPL(skcipher_walk_async); static int skcipher_walk_aead_common(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); int err; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if (unlikely(!walk->total)) return 0; walk->flags &= ~SKCIPHER_WALK_PHYS; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); scatterwalk_copychunks(NULL, &walk->in, req->assoclen, 2); scatterwalk_copychunks(NULL, &walk->out, req->assoclen, 2); scatterwalk_done(&walk->in, 0, walk->total); scatterwalk_done(&walk->out, 0, walk->total); if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) walk->flags |= SKCIPHER_WALK_SLEEP; else walk->flags &= ~SKCIPHER_WALK_SLEEP; walk->blocksize = crypto_aead_blocksize(tfm); walk->stride = crypto_aead_chunksize(tfm); walk->ivsize = crypto_aead_ivsize(tfm); walk->alignmask = crypto_aead_alignmask(tfm); err = skcipher_walk_first(walk); if (atomic) walk->flags &= ~SKCIPHER_WALK_SLEEP; return err; } int skcipher_walk_aead_encrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { walk->total = req->cryptlen; return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt); int skcipher_walk_aead_decrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); walk->total = req->cryptlen - crypto_aead_authsize(tfm); return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt); static void skcipher_set_needkey(struct crypto_skcipher *tfm) { if (crypto_skcipher_max_keysize(tfm) != 0) crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_skcipher_alignmask(tfm); struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); u8 *buffer, *alignbuffer; unsigned long absize; int ret; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cipher->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); unsigned long alignmask = crypto_skcipher_alignmask(tfm); int err; if (cipher->co.base.cra_type != &crypto_skcipher_type) { struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm); crypto_lskcipher_clear_flags(*ctx, CRYPTO_TFM_REQ_MASK); crypto_lskcipher_set_flags(*ctx, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_lskcipher_setkey(*ctx, key, keylen); goto out; } if (keylen < cipher->min_keysize || keylen > cipher->max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) err = skcipher_setkey_unaligned(tfm, key, keylen); else err = cipher->setkey(tfm, key, keylen); out: if (unlikely(err)) { skcipher_set_needkey(tfm); return err; } crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_skcipher_setkey); int crypto_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_encrypt_sg(req); return alg->encrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt); int crypto_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_decrypt_sg(req); return alg->decrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt); static int crypto_lskcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(out, ivs + crypto_skcipher_ivsize(tfm), crypto_skcipher_statesize(tfm)); return 0; } static int crypto_lskcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(ivs + crypto_skcipher_ivsize(tfm), in, crypto_skcipher_statesize(tfm)); return 0; } static int skcipher_noexport(struct skcipher_request *req, void *out) { return 0; } static int skcipher_noimport(struct skcipher_request *req, const void *in) { return 0; } int crypto_skcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_export(req, out); return alg->export(req, out); } EXPORT_SYMBOL_GPL(crypto_skcipher_export); int crypto_skcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_import(req, in); return alg->import(req, in); } EXPORT_SYMBOL_GPL(crypto_skcipher_import); static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); alg->exit(skcipher); } static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); skcipher_set_needkey(skcipher); if (tfm->__crt_alg->cra_type != &crypto_skcipher_type) { unsigned am = crypto_skcipher_alignmask(skcipher); unsigned reqsize; reqsize = am & ~(crypto_tfm_ctx_alignment() - 1); reqsize += crypto_skcipher_ivsize(skcipher); reqsize += crypto_skcipher_statesize(skcipher); crypto_skcipher_set_reqsize(skcipher, reqsize); return crypto_init_lskcipher_ops_sg(tfm); } if (alg->exit) skcipher->base.exit = crypto_skcipher_exit_tfm; if (alg->init) return alg->init(skcipher); return 0; } static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) { if (alg->cra_type != &crypto_skcipher_type) return sizeof(struct crypto_lskcipher *); return crypto_alg_extsize(alg); } static void crypto_skcipher_free_instance(struct crypto_instance *inst) { struct skcipher_instance *skcipher = container_of(inst, struct skcipher_instance, s.base); skcipher->free(skcipher); } static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); seq_printf(m, "type : skcipher\n"); seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); seq_printf(m, "ivsize : %u\n", skcipher->ivsize); seq_printf(m, "chunksize : %u\n", skcipher->chunksize); seq_printf(m, "walksize : %u\n", skcipher->walksize); seq_printf(m, "statesize : %u\n", skcipher->statesize); } static int __maybe_unused crypto_skcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); struct crypto_report_blkcipher rblkcipher; memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->min_keysize; rblkcipher.max_keysize = skcipher->max_keysize; rblkcipher.ivsize = skcipher->ivsize; return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(rblkcipher), &rblkcipher); } static const struct crypto_type crypto_skcipher_type = { .extsize = crypto_skcipher_extsize, .init_tfm = crypto_skcipher_init_tfm, .free = crypto_skcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_skcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_skcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK, .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), }; int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_skcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_skcipher); struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( const char *alg_name, u32 type, u32 mask) { struct crypto_skcipher *tfm; /* Only sync algorithms allowed. */ mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE; tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); /* * Make sure we do not allocate something that might get used with * an on-stack request: check the request size. */ if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) > MAX_SYNC_SKCIPHER_REQSIZE)) { crypto_free_skcipher(tfm); return ERR_PTR(-EINVAL); } return (struct crypto_sync_skcipher *)tfm; } EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher); int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_skcipher); int skcipher_prepare_alg_common(struct skcipher_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 || alg->statesize > PAGE_SIZE / 2 || (alg->ivsize + alg->statesize) > PAGE_SIZE / 2) return -EINVAL; if (!alg->chunksize) alg->chunksize = base->cra_blocksize; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int skcipher_prepare_alg(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg_common(&alg->co); if (err) return err; if (alg->walksize > PAGE_SIZE / 8) return -EINVAL; if (!alg->walksize) alg->walksize = alg->chunksize; if (!alg->statesize) { alg->import = skcipher_noimport; alg->export = skcipher_noexport; } else if (!(alg->import && alg->export)) return -EINVAL; base->cra_type = &crypto_skcipher_type; base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; return 0; } int crypto_register_skcipher(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_skcipher); void crypto_unregister_skcipher(struct skcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); int crypto_register_skciphers(struct skcipher_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_skcipher(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_skciphers); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = skcipher_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(skcipher_register_instance); static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(cipher, key, keylen); } static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->cipher = cipher; return 0; } static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); crypto_free_cipher(ctx->cipher); } static void skcipher_free_instance_simple(struct skcipher_instance *inst) { crypto_drop_cipher(skcipher_instance_ctx(inst)); kfree(inst); } /** * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode * * Allocate an skcipher_instance for a simple block cipher mode of operation, * e.g. cbc or ecb. The instance context will have just a single crypto_spawn, * that for the underlying cipher. The {min,max}_keysize, ivsize, blocksize, * alignmask, and priority are set from the underlying cipher but can be * overridden if needed. The tfm context defaults to skcipher_ctx_simple, and * default ->setkey(), ->init(), and ->exit() methods are installed. * * @tmpl: the template being instantiated * @tb: the template parameters * * Return: a pointer to the new instance, or an ERR_PTR(). The caller still * needs to register the instance. */ struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct skcipher_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *cipher_alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = skcipher_instance_ctx(inst); err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; cipher_alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name, cipher_alg); if (err) goto err_free_inst; inst->free = skcipher_free_instance_simple; /* Default algorithm properties, can be overridden */ inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize; inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask; inst->alg.base.cra_priority = cipher_alg->cra_priority; inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize; inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize; inst->alg.ivsize = cipher_alg->cra_blocksize; /* Use skcipher_ctx_simple by default, can be overridden */ inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple); inst->alg.setkey = skcipher_setkey_simple; inst->alg.init = skcipher_init_tfm_simple; inst->alg.exit = skcipher_exit_tfm_simple; return inst; err_free_inst: skcipher_free_instance_simple(inst); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); MODULE_IMPORT_NS(CRYPTO_INTERNAL);
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vb2 #if !defined(_TRACE_VB2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VB2_H #include <linux/tracepoint.h> #include <media/videobuf2-core.h> DECLARE_EVENT_CLASS(vb2_event_class, TP_PROTO(struct vb2_queue *q, struct vb2_buffer *vb), TP_ARGS(q, vb), TP_STRUCT__entry( __field(void *, owner) __field(u32, queued_count) __field(int, owned_by_drv_count) __field(u32, index) __field(u32, type) __field(u32, bytesused) __field(u64, timestamp) ), TP_fast_assign( __entry->owner = q->owner; __entry->queued_count = q->queued_count; __entry->owned_by_drv_count = atomic_read(&q->owned_by_drv_count); __entry->index = vb->index; __entry->type = vb->type; __entry->bytesused = vb->planes[0].bytesused; __entry->timestamp = vb->timestamp; ), TP_printk("owner = %p, queued = %u, owned_by_drv = %d, index = %u, " "type = %u, bytesused = %u, timestamp = %llu", __entry->owner, __entry->queued_count, __entry->owned_by_drv_count, __entry->index, __entry->type, __entry->bytesused, __entry->timestamp ) ) DEFINE_EVENT(vb2_event_class, vb2_buf_done, TP_PROTO(struct vb2_queue *q, struct vb2_buffer *vb), TP_ARGS(q, vb) ); DEFINE_EVENT(vb2_event_class, vb2_buf_queue, TP_PROTO(struct vb2_queue *q, struct vb2_buffer *vb), TP_ARGS(q, vb) ); DEFINE_EVENT(vb2_event_class, vb2_dqbuf, TP_PROTO(struct vb2_queue *q, struct vb2_buffer *vb), TP_ARGS(q, vb) ); DEFINE_EVENT(vb2_event_class, vb2_qbuf, TP_PROTO(struct vb2_queue *q, struct vb2_buffer *vb), TP_ARGS(q, vb) ); #endif /* if !defined(_TRACE_VB2_H) || defined(TRACE_HEADER_MULTI_READ) */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_ESP_H #define _NET_ESP_H #include <linux/skbuff.h> struct ip_esp_hdr; struct xfrm_state; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) { return (struct ip_esp_hdr *)skb_transport_header(skb); } static inline void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) { /* Fill padding... */ if (tfclen) { memset(tail, 0, tfclen); tail += tfclen; } do { int i; for (i = 0; i < plen - 2; i++) tail[i] = i + 1; } while (0); tail[plen - 2] = plen - 2; tail[plen - 1] = proto; } struct esp_info { struct ip_esp_hdr *esph; __be64 seqno; int tfclen; int tailen; int plen; int clen; int len; int nfrags; __u8 proto; bool inplace; }; int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp_input_done2(struct sk_buff *skb, int err); int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); int esp6_input_done2(struct sk_buff *skb, int err); #endif
9 8 8 4 5 9 7 7 6 4 3 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 // SPDX-License-Identifier: GPL-2.0-only /* * ebt_ip * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2002 * * Changes: * added ip-sport and ip-dport * Innominate Security Technologies AG <mhopf@innominate.com> * September, 2002 */ #include <linux/ip.h> #include <net/ip.h> #include <linux/in.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip.h> union pkthdr { struct { __be16 src; __be16 dst; } tcpudphdr; struct { u8 type; u8 code; } icmphdr; struct { u8 type; } igmphdr; }; static bool ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_ip_info *info = par->matchinfo; const struct iphdr *ih; struct iphdr _iph; const union pkthdr *pptr; union pkthdr _pkthdr; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) return false; if ((info->bitmask & EBT_IP_TOS) && NF_INVF(info, EBT_IP_TOS, info->tos != ih->tos)) return false; if ((info->bitmask & EBT_IP_SOURCE) && NF_INVF(info, EBT_IP_SOURCE, (ih->saddr & info->smsk) != info->saddr)) return false; if ((info->bitmask & EBT_IP_DEST) && NF_INVF(info, EBT_IP_DEST, (ih->daddr & info->dmsk) != info->daddr)) return false; if (info->bitmask & EBT_IP_PROTO) { if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | EBT_IP_ICMP | EBT_IP_IGMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; /* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP_DPORT) { u32 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { u32 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } if ((info->bitmask & EBT_IP_ICMP) && NF_INVF(info, EBT_IP_ICMP, pptr->icmphdr.type < info->icmp_type[0] || pptr->icmphdr.type > info->icmp_type[1] || pptr->icmphdr.code < info->icmp_code[0] || pptr->icmphdr.code > info->icmp_code[1])) return false; if ((info->bitmask & EBT_IP_IGMP) && NF_INVF(info, EBT_IP_IGMP, pptr->igmphdr.type < info->igmp_type[0] || pptr->igmphdr.type > info->igmp_type[1])) return false; } return true; } static int ebt_ip_mt_check(const struct xt_mtchk_param *par) { const struct ebt_ip_info *info = par->matchinfo; const struct ebt_entry *e = par->entryinfo; if (e->ethproto != htons(ETH_P_IP) || e->invflags & EBT_IPROTO) return -EINVAL; if (info->bitmask & ~EBT_IP_MASK || info->invflags & ~EBT_IP_MASK) return -EINVAL; if (info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) { if (info->invflags & EBT_IP_PROTO) return -EINVAL; if (info->protocol != IPPROTO_TCP && info->protocol != IPPROTO_UDP && info->protocol != IPPROTO_UDPLITE && info->protocol != IPPROTO_SCTP && info->protocol != IPPROTO_DCCP) return -EINVAL; } if (info->bitmask & EBT_IP_DPORT && info->dport[0] > info->dport[1]) return -EINVAL; if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; if (info->bitmask & EBT_IP_ICMP) { if ((info->invflags & EBT_IP_PROTO) || info->protocol != IPPROTO_ICMP) return -EINVAL; if (info->icmp_type[0] > info->icmp_type[1] || info->icmp_code[0] > info->icmp_code[1]) return -EINVAL; } if (info->bitmask & EBT_IP_IGMP) { if ((info->invflags & EBT_IP_PROTO) || info->protocol != IPPROTO_IGMP) return -EINVAL; if (info->igmp_type[0] > info->igmp_type[1]) return -EINVAL; } return 0; } static struct xt_match ebt_ip_mt_reg __read_mostly = { .name = "ip", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_ip_mt, .checkentry = ebt_ip_mt_check, .matchsize = sizeof(struct ebt_ip_info), .me = THIS_MODULE, }; static int __init ebt_ip_init(void) { return xt_register_match(&ebt_ip_mt_reg); } static void __exit ebt_ip_fini(void) { xt_unregister_match(&ebt_ip_mt_reg); } module_init(ebt_ip_init); module_exit(ebt_ip_fini); MODULE_DESCRIPTION("Ebtables: IPv4 protocol packet match"); MODULE_LICENSE("GPL");
146 148 5 5 5 8 8 8 22 149 86 86 86 86 85 8 2 8 121 10 10 8 2 8 2 2 2 2 8 8 150 86 85 86 149 112 8 15 146 8 9 7 3 1 2 148 135 136 82 117 148 11 119 118 119 119 148 4 146 147 148 115 116 116 116 148 148 148 148 148 148 119 6 147 148 117 117 115 117 19 117 117 116 116 116 117 116 117 117 117 116 117 117 117 117 117 117 117 117 117 113 113 113 113 117 117 116 117 148 147 4 4 4 148 147 117 117 117 117 4 145 106 9 6 6 6 6 6 112 112 112 112 112 111 111 112 106 106 112 112 112 112 112 4 112 106 111 106 112 112 112 146 148 148 6 6 58 2 150 148 148 148 1 148 10 148 148 8 8 8 8 8 8 8 7 8 8 8 8 18 19 19 19 18 9 19 2 2 2 145 148 148 9 9 9 148 147 148 148 147 148 147 148 148 147 148 148 148 148 148 149 18 150 149 150 26 26 149 149 150 150 148 150 149 6 4 146 148 79 80 80 80 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 20 4 4 4 4 4 4 4 106 3 3 3 3 2 2 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 85 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/workqueue.h> #include <linux/jiffies.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/percpu.h> #include <linux/list_sort.h> #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/pid_namespace.h> #include <linux/fdtable.h> #include <linux/file.h> #include "gfs2.h" #include "incore.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "lops.h" #include "meta_io.h" #include "quota.h" #include "super.h" #include "util.h" #include "bmap.h" #define CREATE_TRACE_POINTS #include "trace_gfs2.h" struct gfs2_glock_iter { struct gfs2_sbd *sdp; /* incore superblock */ struct rhashtable_iter hti; /* rhashtable iterator */ struct gfs2_glock *gl; /* current glock struct */ loff_t last_pos; /* last position */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote); static struct dentry *gfs2_root; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), .head_offset = offsetof(struct gfs2_glock, gl_node), }; static struct rhashtable gl_hash_table; #define GLOCK_WAIT_TABLE_BITS 12 #define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; struct wait_glock_queue { struct lm_lockname *name; wait_queue_entry_t wait; }; static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct wait_glock_queue *wait_glock = container_of(wait, struct wait_glock_queue, wait); struct lm_lockname *wait_name = wait_glock->name; struct lm_lockname *wake_name = key; if (wake_name->ln_sbd != wait_name->ln_sbd || wake_name->ln_number != wait_name->ln_number || wake_name->ln_type != wait_name->ln_type) return 0; return autoremove_wake_function(wait, mode, sync, key); } static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } /** * wake_up_glock - Wake up waiters on a glock * @gl: the glock */ static void wake_up_glock(struct gfs2_glock *gl) { wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); } static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); kfree(gl->gl_lksb.sb_lvbptr); if (gl->gl_ops->go_flags & GLOF_ASPACE) { struct gfs2_glock_aspace *gla = container_of(gl, struct gfs2_glock_aspace, glock); kmem_cache_free(gfs2_glock_aspace_cachep, gla); } else kmem_cache_free(gfs2_glock_cachep, gl); } /** * glock_blocked_by_withdraw - determine if we can still use a glock * @gl: the glock * * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted * when we're withdrawn. For example, to maintain metadata integrity, we should * disallow the use of inode and rgrp glocks when withdrawn. Other glocks like * the iopen or freeze glock may be safely used because none of their * metadata goes through the journal. So in general, we should disallow all * glocks that are journaled, and allow all the others. One exception is: * we need to allow our active journal to be promoted and demoted so others * may recover it and we can reacquire it when they're done. */ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!gfs2_withdrawing_or_withdrawn(sdp)) return false; if (gl->gl_ops->go_flags & GLOF_NONDISK) return false; if (!sdp->sd_jdesc || gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr) return false; return true; } static void __gfs2_glock_free(struct gfs2_glock *gl) { rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); } void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; __gfs2_glock_free(gl); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); } void gfs2_glock_free_later(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&lru_lock); list_add(&gl->gl_lru, &sdp->sd_dead_glocks); spin_unlock(&lru_lock); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); } static void gfs2_free_dead_glocks(struct gfs2_sbd *sdp) { struct list_head *list = &sdp->sd_dead_glocks; while(!list_empty(list)) { struct gfs2_glock *gl; gl = list_first_entry(list, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); __gfs2_glock_free(gl); } } /** * gfs2_glock_hold() - increment reference count on glock * @gl: The glock to hold * */ struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); return gl; } static void gfs2_glock_add_to_lru(struct gfs2_glock *gl) { spin_lock(&lru_lock); list_move_tail(&gl->gl_lru, &lru_list); if (!test_bit(GLF_LRU, &gl->gl_flags)) { set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); } spin_unlock(&lru_lock); } static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { spin_lock(&lru_lock); if (test_bit(GLF_LRU, &gl->gl_flags)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); } spin_unlock(&lru_lock); } /* * Enqueue the glock on the work queue. Passes one glock reference on to the * work queue. */ static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!queue_delayed_work(sdp->sd_glock_wq, &gl->gl_work, delay)) { /* * We are holding the lockref spinlock, and the work was still * queued above. The queued work (glock_work_func) takes that * spinlock before dropping its glock reference(s), so it * cannot have dropped them in the meantime. */ GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); gl->gl_lockref.count--; } } static void __gfs2_glock_put(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); lockref_mark_dead(&gl->gl_lockref); spin_unlock(&gl->gl_lockref.lock); gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); if (!gfs2_withdrawing_or_withdrawn(sdp)) GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } static bool __gfs2_glock_put_or_lock(struct gfs2_glock *gl) { if (lockref_put_or_lock(&gl->gl_lockref)) return true; GLOCK_BUG_ON(gl, gl->gl_lockref.count != 1); if (gl->gl_state != LM_ST_UNLOCKED) { gl->gl_lockref.count--; gfs2_glock_add_to_lru(gl); spin_unlock(&gl->gl_lockref.lock); return true; } return false; } /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * */ void gfs2_glock_put(struct gfs2_glock *gl) { if (__gfs2_glock_put_or_lock(gl)) return; __gfs2_glock_put(gl); } /* * gfs2_glock_put_async - Decrement reference count without sleeping * @gl: The glock to put * * Decrement the reference count on glock immediately unless it is the last * reference. Defer putting the last reference to work queue context. */ void gfs2_glock_put_async(struct gfs2_glock *gl) { if (__gfs2_glock_put_or_lock(gl)) return; gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } /** * may_grant - check if it's ok to grant a new lock * @gl: The glock * @current_gh: One of the current holders of @gl * @gh: The lock request which we wish to grant * * With our current compatibility rules, if a glock has one or more active * holders (HIF_HOLDER flag set), any of those holders can be passed in as * @current_gh; they are all the same as far as compatibility with the new @gh * goes. * * Returns true if it's ok to grant the lock. */ static inline bool may_grant(struct gfs2_glock *gl, struct gfs2_holder *current_gh, struct gfs2_holder *gh) { if (current_gh) { GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags)); switch(current_gh->gh_state) { case LM_ST_EXCLUSIVE: /* * Here we make a special exception to grant holders * who agree to share the EX lock with other holders * who also have the bit set. If the original holder * has the LM_FLAG_NODE_SCOPE bit set, we grant more * holders with the bit set. */ return gh->gh_state == LM_ST_EXCLUSIVE && (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && (gh->gh_flags & LM_FLAG_NODE_SCOPE); case LM_ST_SHARED: case LM_ST_DEFERRED: return gh->gh_state == current_gh->gh_state; default: return false; } } if (gl->gl_state == gh->gh_state) return true; if (gh->gh_flags & GL_EXACT) return false; if (gl->gl_state == LM_ST_EXCLUSIVE) { return gh->gh_state == LM_ST_SHARED || gh->gh_state == LM_ST_DEFERRED; } if (gh->gh_flags & LM_FLAG_ANY) return gl->gl_state != LM_ST_UNLOCKED; return false; } static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); if (gh->gh_flags & GL_ASYNC) { struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd; wake_up(&sdp->sd_async_glock_wait); } } /** * do_error - Something unexpected has happened during a lock request * @gl: The glock * @ret: The status from the DLM */ static void do_error(struct gfs2_glock *gl, const int ret) { struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) gh->gh_error = GLR_TRYFAILED; else continue; list_del_init(&gh->gh_list); trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); } } /** * find_first_holder - find the first "holder" gh * @gl: the glock */ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) { struct gfs2_holder *gh; if (!list_empty(&gl->gl_holders)) { gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); if (test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } return NULL; } /* * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder * * Returns: 0 if instantiate was successful, or error. */ int gfs2_instantiate(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; int ret; again: if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) goto done; /* * Since we unlock the lockref lock, we set a flag to indicate * instantiate is in progress. */ if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, TASK_UNINTERRUPTIBLE); /* * Here we just waited for a different instantiate to finish. * But that may not have been successful, as when a process * locks an inode glock _before_ it has an actual inode to * instantiate into. So we check again. This process might * have an inode to instantiate, so might be successful. */ goto again; } ret = glops->go_instantiate(gl); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); if (ret) return ret; done: if (glops->go_held) return glops->go_held(gh); return 0; } /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * * Returns true on success (i.e., progress was made or there are no waiters). */ static bool do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; current_gh = find_first_holder(gl); list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this * holder for some reason. If this holder is at the * head of the list, it means we have a blocked holder * at the head, so return false. */ if (list_is_first(&gh->gh_list, &gl->gl_holders)) return false; do_error(gl, 0); break; } set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); if (!current_gh) current_gh = gh; } return true; } /** * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock */ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) { struct gfs2_holder *gh; list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } return NULL; } /** * find_last_waiter - find the last gh that's waiting for the glock * @gl: the glock * * This also is a fast way of finding out if there are any waiters. */ static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl) { struct gfs2_holder *gh; if (list_empty(&gl->gl_holders)) return NULL; gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list); return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh; } /** * state_change - record that the glock is now in a different state * @gl: the glock * @new_state: the new state */ static void state_change(struct gfs2_glock *gl, unsigned int new_state) { if (new_state != gl->gl_target) /* shorten our minimum hold time */ gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, GL_GLOCK_MIN_HOLD); gl->gl_state = new_state; gl->gl_tchange = jiffies; } static void gfs2_set_demote(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; set_bit(GLF_DEMOTE, &gl->gl_flags); smp_mb(); wake_up(&sdp->sd_async_glock_wait); } static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } /** * finish_xmote - The DLM has replied to one of our lock requests * @gl: The glock * @ret: The status from the DLM * */ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh; unsigned state = ret & LM_OUT_ST_MASK; trace_gfs2_glock_state_change(gl, state); state_change(gl, state); gh = find_first_waiter(gl); /* Demote to UN request arrived during demote to SH or DF */ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) gl->gl_target = LM_ST_UNLOCKED; /* Check for state != intended state */ if (unlikely(state != gl->gl_target)) { if (gh && (ret & LM_OUT_CANCELED)) gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { list_move_tail(&gh->gh_list, &gl->gl_holders); gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (do_promote(gl)) goto out; goto retry; } /* Some error or failed "try lock" - report it */ if ((ret & LM_OUT_ERROR) || (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { gl->gl_target = gl->gl_state; do_error(gl, ret); goto out; } } switch(state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: retry: do_xmote(gl, gh, gl->gl_target); break; /* Conversion fails, unlock and try again */ case LM_ST_SHARED: case LM_ST_DEFERRED: do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n", gl->gl_target, state); GLOCK_BUG_ON(gl, 1); } return; } /* Fast path - we got what we asked for */ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { int rv; spin_unlock(&gl->gl_lockref.lock); rv = glops->go_xmote_bh(gl); spin_lock(&gl->gl_lockref.lock); if (rv) { do_error(gl, rv); goto out; } } do_promote(gl); } out: clear_bit(GLF_LOCK, &gl->gl_flags); } static bool is_system_glock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); if (gl == m_ip->i_gl) return true; return false; } /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state * @gh: The holder (only for promotes) * @target: The target lock state * */ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { /* * If another process is already doing the invalidate, let that * finish first. The glock state machine will get back to this * holder again later. */ if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) return; do_error(gl, 0); /* Fail queued try locks */ } gl->gl_req = target; set_bit(GLF_BLOCKING, &gl->gl_flags); if ((gl->gl_req == LM_ST_UNLOCKED) || (gl->gl_state == LM_ST_EXCLUSIVE) || (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); if (!glops->go_inval && !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); if (glops->go_sync) { ret = glops->go_sync(gl); /* If we had a problem syncing (due to io errors or whatever, * we should not invalidate the metadata or tell dlm to * release the glock to other nodes. */ if (ret) { if (cmpxchg(&sdp->sd_log_error, 0, ret)) { fs_err(sdp, "Error %d syncing glock \n", ret); gfs2_dump_glock(NULL, gl, true); } spin_lock(&gl->gl_lockref.lock); goto skip_inval; } } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { /* * The call to go_sync should have cleared out the ail list. * If there are still items, we have a problem. We ought to * withdraw, but we can't because the withdraw code also uses * glocks. Warn about the error, dump the glock, then fall * through and wait for logd to do the withdraw for us. */ if ((atomic_read(&gl->gl_ail_count) != 0) && (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) { gfs2_glock_assert_warn(gl, !atomic_read(&gl->gl_ail_count)); gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } spin_lock(&gl->gl_lockref.lock); skip_inval: gl->gl_lockref.count++; /* * Check for an error encountered since we called go_sync and go_inval. * If so, we can't withdraw from the glock code because the withdraw * code itself uses glocks (see function signal_our_withdraw) to * change the mount to read-only. Most importantly, we must not call * dlm to unlock the glock until the journal is in a known good state * (after journal replay) otherwise other nodes may use the object * (rgrp or dinode) and then later, journal replay will corrupt the * file system. The best we can do here is wait for the logd daemon * to see sd_log_error and withdraw, and in the meantime, requeue the * work for later. * * We make a special exception for some system glocks, such as the * system statfs inode glock, which needs to be granted before the * gfs2_quotad daemon can exit, and that exit needs to finish before * we can unmount the withdrawn file system. * * However, if we're just unlocking the lock (say, for unmount, when * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. */ if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp)) gfs2_withdraw_delayed(sdp); if (glock_blocked_by_withdraw(gl) && (target != LM_ST_UNLOCKED || test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { if (!is_system_glock(gl)) { request_demote(gl, LM_ST_UNLOCKED, 0, false); /* * Ordinarily, we would call dlm and its callback would call * finish_xmote, which would call state_change() to the new state. * Since we withdrew, we won't call dlm, so call state_change * manually, but to the UNLOCKED state we desire. */ state_change(gl, LM_ST_UNLOCKED); /* * We skip telling dlm to do the locking, so we won't get a * reply that would otherwise clear GLF_LOCK. So we clear it here. */ clear_bit(GLF_LOCK, &gl->gl_flags); clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; } else { clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } if (ls->ls_ops->lm_lock) { spin_unlock(&gl->gl_lockref.lock); ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && target == LM_ST_UNLOCKED && test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { /* * The lockspace has been released and the lock has * been unlocked implicitly. */ } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); target = gl->gl_state | LM_OUT_ERROR; } else { /* The operation will be completed asynchronously. */ return; } } /* Complete the operation now. */ finish_xmote(gl, target); gfs2_glock_queue_work(gl, 0); } /** * run_queue - do all outstanding tasks related to a glock * @gl: The glock in question * @nonblock: True if we must not block in run_queue * */ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_holder *gh = NULL; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_demote_state != gl->gl_state) { if (find_first_holder(gl)) goto out_unlock; if (nonblock) goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target); return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); } /** * glock_set_object - set the gl_object field of a glock * @gl: the glock * @object: the object */ void glock_set_object(struct gfs2_glock *gl, void *object) { void *prev_object; spin_lock(&gl->gl_lockref.lock); prev_object = gl->gl_object; gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { pr_warn("glock=%u/%llx\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); gfs2_dump_glock(NULL, gl, true); } } /** * glock_clear_object - clear the gl_object field of a glock * @gl: the glock * @object: object the glock currently points at */ void glock_clear_object(struct gfs2_glock *gl, void *object) { void *prev_object; spin_lock(&gl->gl_lockref.lock); prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { pr_warn("glock=%u/%llx\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); gfs2_dump_glock(NULL, gl, true); } } void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; if (ri->ri_magic == 0) ri->ri_magic = cpu_to_be32(GFS2_MAGIC); if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) ri->ri_generation_deleted = cpu_to_be64(generation); } bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) return false; return generation <= be64_to_cpu(ri->ri_generation_deleted); } static void gfs2_glock_poke(struct gfs2_glock *gl) { int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP; struct gfs2_holder gh; int error; __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_); error = gfs2_glock_nq(&gh); if (!error) gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); } static bool gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; bool evicted = false; /* * If there is contention on the iopen glock and we have an inode, try * to grab and release the inode so that it can be evicted. This will * allow the remote node to go ahead and delete the inode without us * having to do it, which will avoid rgrp glock thrashing. * * The remote node is likely still holding the corresponding inode * glock, so it will run before we get to verify that the delete has * happened below. */ spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip && !igrab(&ip->i_inode)) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { gl->gl_no_formal_ino = ip->i_no_formal_ino; set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); /* If the inode was evicted, gl->gl_object will now be NULL. */ spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip) { clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); if (!igrab(&ip->i_inode)) ip = NULL; } spin_unlock(&gl->gl_lockref.lock); if (ip) { gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); } evicted = !ip; } return evicted; } bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) return false; return queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } static bool gfs2_queue_verify_evict(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) return false; return queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 5 * HZ); } static void delete_work_func(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct inode *inode; u64 no_addr = gl->gl_name.ln_number; if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) { /* * If we can evict the inode, give the remote node trying to * delete the inode some time before verifying that the delete * has happened. Otherwise, if we cause contention on the inode glock * immediately, the remote node will think that we still have * the inode in use, and so it will give up waiting. * * If we can't evict the inode, signal to the remote node that * the inode is still in use. We'll later try to delete the * inode locally in gfs2_evict_inode. * * FIXME: We only need to verify that the remote node has * deleted the inode because nodes before this remote delete * rework won't cooperate. At a later time, when we no longer * care about compatibility with such nodes, we can skip this * step entirely. */ if (gfs2_try_evict(gl)) { if (test_bit(SDF_KILL, &sdp->sd_flags)) goto out; if (gfs2_queue_verify_evict(gl)) return; } goto out; } if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) { inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -EAGAIN && !test_bit(SDF_KILL, &sdp->sd_flags) && gfs2_queue_verify_evict(gl)) return; } else { d_prune_aliases(inode); iput(inode); } } out: gfs2_glock_put(gl); } static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); unsigned int drop_refs = 1; spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) { clear_bit(GLF_HAVE_REPLY, &gl->gl_flags); finish_xmote(gl, gl->gl_reply); drop_refs++; } if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { if (gl->gl_name.ln_type == LM_TYPE_INODE) { unsigned long holdtime, now = jiffies; holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; } if (!delay) { clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); gfs2_set_demote(gl); } } run_queue(gl, 0); if (delay) { /* Keep one glock reference for the work we requeue. */ drop_refs--; gfs2_glock_queue_work(gl, delay); } /* Drop the remaining glock references manually. */ GLOCK_BUG_ON(gl, gl->gl_lockref.count < drop_refs); gl->gl_lockref.count -= drop_refs; if (!gl->gl_lockref.count) { if (gl->gl_state == LM_ST_UNLOCKED) { __gfs2_glock_put(gl); return; } gfs2_glock_add_to_lru(gl); } spin_unlock(&gl->gl_lockref.lock); } static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, struct gfs2_glock *new) { struct wait_glock_queue wait; wait_queue_head_t *wq = glock_waitqueue(name); struct gfs2_glock *gl; wait.name = name; init_wait(&wait.wait); wait.wait.func = glock_wake_function; again: prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); rcu_read_lock(); if (new) { gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, &new->gl_node, ht_parms); if (IS_ERR(gl)) goto out; } else { gl = rhashtable_lookup_fast(&gl_hash_table, name, ht_parms); } if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { rcu_read_unlock(); schedule(); goto again; } out: rcu_read_unlock(); finish_wait(wq, &wait.wait); if (gl) gfs2_glock_remove_from_lru(gl); return gl; } /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock * @number: the lock number * @glops: The glock_operations to use * @create: If 0, don't create the glock if it doesn't exist * @glp: the glock is returned here * * This does not lock a glock, just finds/creates structures for one. * * Returns: errno */ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp) { struct super_block *s = sdp->sd_vfs; struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; struct gfs2_glock *gl, *tmp; struct address_space *mapping; gl = find_insert_glock(&name, NULL); if (gl) goto found; if (!create) return -ENOENT; if (glops->go_flags & GLOF_ASPACE) { struct gfs2_glock_aspace *gla = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS); if (!gla) return -ENOMEM; gl = &gla->glock; } else { gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS); if (!gl) return -ENOMEM; } memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); gl->gl_ops = glops; if (glops->go_flags & GLOF_LVB) { gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { gfs2_glock_dealloc(&gl->gl_rcu); return -ENOMEM; } } atomic_inc(&sdp->sd_glock_disposal); gl->gl_node.next = NULL; gl->gl_flags = BIT(GLF_INITIAL); if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); gl->gl_lockref.count = 1; gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_dstamp = 0; preempt_disable(); /* We use the global stats to estimate the initial per-glock stats */ gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; preempt_enable(); gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); if (gl->gl_name.ln_type == LM_TYPE_IOPEN) INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { mapping->a_ops = &gfs2_meta_aops; mapping->host = s->s_bdev->bd_mapping->host; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->i_private_data = NULL; mapping->writeback_index = 0; } tmp = find_insert_glock(&name, gl); if (tmp) { gfs2_glock_dealloc(&gl->gl_rcu); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); if (IS_ERR(tmp)) return PTR_ERR(tmp); gl = tmp; } found: *glp = gl; return 0; } /** * __gfs2_holder_init - initialize a struct gfs2_holder in the default way * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags * @gh: the holder structure * */ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh, unsigned long ip) { INIT_LIST_HEAD(&gh->gh_list); gh->gh_gl = gfs2_glock_hold(gl); gh->gh_ip = ip; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; } /** * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it * @state: the state we're requesting * @flags: the modifier flags * @gh: the holder structure * * Don't mess with the glock. * */ void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh) { gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = _RET_IP_; put_pid(gh->gh_owner_pid); gh->gh_owner_pid = get_pid(task_pid(current)); } /** * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) * @gh: the holder structure * */ void gfs2_holder_uninit(struct gfs2_holder *gh) { put_pid(gh->gh_owner_pid); gfs2_glock_put(gh->gh_gl); gfs2_holder_mark_uninitialized(gh); gh->gh_ip = 0; } static void gfs2_glock_update_hold_time(struct gfs2_glock *gl, unsigned long start_time) { /* Have we waited longer that a second? */ if (time_after(jiffies, start_time + HZ)) { /* Lengthen the minimum hold time. */ gl->gl_hold_time = min(gl->gl_hold_time + GL_GLOCK_HOLD_INCR, GL_GLOCK_MAX_HOLD); } } /** * gfs2_glock_holder_ready - holder is ready and its error code can be collected * @gh: the glock holder * * Called when a glock holder no longer needs to be waited for because it is * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has * failed (gh_error != 0). */ int gfs2_glock_holder_ready(struct gfs2_holder *gh) { if (gh->gh_error || (gh->gh_flags & GL_SKIP)) return gh->gh_error; gh->gh_error = gfs2_instantiate(gh); if (gh->gh_error) gfs2_glock_dq(gh); return gh->gh_error; } /** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * * Returns: 0 on success */ int gfs2_glock_wait(struct gfs2_holder *gh) { unsigned long start_time = jiffies; might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); gfs2_glock_update_hold_time(gh->gh_gl, start_time); return gfs2_glock_holder_ready(gh); } static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs) { int i; for (i = 0; i < num_gh; i++) if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) return 1; return 0; } /** * gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions * @num_gh: the number of holders in the array * @ghs: the glock holder array * * Returns: 0 on success, meaning all glocks have been granted and are held. * -ESTALE if the request timed out, meaning all glocks were released, * and the caller should retry the operation. */ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) { struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd; int i, ret = 0, timeout = 0; unsigned long start_time = jiffies; might_sleep(); /* * Total up the (minimum hold time * 2) of all glocks and use that to * determine the max amount of time we should wait. */ for (i = 0; i < num_gh; i++) timeout += ghs[i].gh_gl->gl_hold_time << 1; if (!wait_event_timeout(sdp->sd_async_glock_wait, !glocks_pending(num_gh, ghs), timeout)) { ret = -ESTALE; /* request timed out. */ goto out; } for (i = 0; i < num_gh; i++) { struct gfs2_holder *gh = &ghs[i]; int ret2; if (test_bit(HIF_HOLDER, &gh->gh_iflags)) { gfs2_glock_update_hold_time(gh->gh_gl, start_time); } ret2 = gfs2_glock_holder_ready(gh); if (!ret) ret = ret2; } out: if (ret) { for (i = 0; i < num_gh; i++) { struct gfs2_holder *gh = &ghs[i]; gfs2_glock_dq(gh); } } return ret; } /** * request_demote - process a demote request * @gl: the glock * @state: the state the caller wants us to change to * @delay: zero to demote immediately; otherwise pending demote * @remote: true if this came from a different cluster node * * There are only two requests that we are going to see in actual * practise: LM_ST_SHARED and LM_ST_UNLOCKED */ static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote) { if (delay) set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); else gfs2_set_demote(gl); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } if (gl->gl_ops->go_callback) gl->gl_ops->go_callback(gl, remote); trace_gfs2_demote_rq(gl, remote); } void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); if (seq) { seq_vprintf(seq, fmt, args); } else { vaf.fmt = fmt; vaf.va = &args; pr_err("%pV", &vaf); } va_end(args); } static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) return true; if (gh->gh_state == LM_ST_UNLOCKED) return true; return false; } /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add * * Eventually we should move the recursive locking trap to a * debugging option or something like that. This is the fast * path and needs to have the minimum number of distractions. * */ static inline void add_to_queue(struct gfs2_holder *gh) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; int try_futile = 0; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { if (test_bit(GLF_LOCK, &gl->gl_flags)) { struct gfs2_holder *current_gh; current_gh = find_first_holder(gl); try_futile = !may_grant(gl, current_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) continue; if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) continue; if (!pid_is_meaningful(gh2)) continue; goto trap_recursive; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: gh->gh_error = GLR_TRYFAILED; gfs2_holder_wake(gh); return; } if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) continue; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); return; } list_add_tail(&gh->gh_list, insert_pt); spin_unlock(&gl->gl_lockref.lock); if (sdp->sd_lockstruct.ls_ops->lm_cancel) sdp->sd_lockstruct.ls_ops->lm_cancel(gl); spin_lock(&gl->gl_lockref.lock); return; trap_recursive: fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip); fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid)); fs_err(sdp, "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip); fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid)); fs_err(sdp, "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); gfs2_dump_glock(NULL, gl, true); BUG(); } /** * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) * @gh: the holder structure * * if (gh->gh_flags & GL_ASYNC), this never returns an error * * Returns: 0, GLR_TRYFAILED, or errno on failure */ int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; int error; if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP)) return -EIO; if (gh->gh_flags & GL_NOBLOCK) { struct gfs2_holder *current_gh; error = -ECHILD; spin_lock(&gl->gl_lockref.lock); if (find_last_waiter(gl)) goto unlock; current_gh = find_first_holder(gl); if (!may_grant(gl, current_gh, gh)) goto unlock; set_bit(HIF_HOLDER, &gh->gh_iflags); list_add_tail(&gh->gh_list, &gl->gl_holders); trace_gfs2_promote(gh); error = 0; unlock: spin_unlock(&gl->gl_lockref.lock); return error; } gh->gh_error = 0; spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags))) { set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); error = 0; if (!(gh->gh_flags & GL_ASYNC)) error = gfs2_glock_wait(gh); return error; } /** * gfs2_glock_poll - poll to see if an async request has been completed * @gh: the holder * * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on */ int gfs2_glock_poll(struct gfs2_holder *gh) { return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } static inline bool needs_demote(struct gfs2_glock *gl) { return (test_bit(GLF_DEMOTE, &gl->gl_flags) || test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); } static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; unsigned delay = 0; int fast_path = 0; /* * This holder should not be cached, so mark it for demote. * Note: this should be done before the check for needs_demote * below. */ if (gh->gh_flags & GL_NOCACHE) request_demote(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); clear_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_glock_queue(gh, 0); /* * If there hasn't been a demote request we are done. * (Let the remaining holders, if any, keep holding it.) */ if (!needs_demote(gl)) { if (list_empty(&gl->gl_holders)) fast_path = 1; } if (unlikely(!fast_path)) { gl->gl_lockref.count++; if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && !test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_name.ln_type == LM_TYPE_INODE) delay = gl->gl_hold_time; gfs2_glock_queue_work(gl, delay); } } /** * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) * @gh: the glock holder * */ void gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&gl->gl_lockref.lock); if (!gfs2_holder_queued(gh)) { /* * May have already been dequeued because the locking request * was GL_ASYNC and it has failed in the meantime. */ goto out; } if (list_is_first(&gh->gh_list, &gl->gl_holders) && !test_bit(HIF_HOLDER, &gh->gh_iflags)) { spin_unlock(&gl->gl_lockref.lock); gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); } /* * If we're in the process of file system withdraw, we cannot just * dequeue any glocks until our journal is recovered, lest we introduce * file system corruption. We need two exceptions to this rule: We need * to allow unlocking of nondisk glocks and the glock for our own * journal that needs recovery. */ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && glock_blocked_by_withdraw(gl) && gh->gh_gl != sdp->sd_jinode_gl) { sdp->sd_glock_dqs_held++; spin_unlock(&gl->gl_lockref.lock); might_sleep(); wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); } __gfs2_glock_dq(gh); out: spin_unlock(&gl->gl_lockref.lock); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it * @gh: the holder structure * */ void gfs2_glock_dq_uninit(struct gfs2_holder *gh) { gfs2_glock_dq(gh); gfs2_holder_uninit(gh); } /** * gfs2_glock_nq_num - acquire a glock based on lock number * @sdp: the filesystem * @number: the lock number * @glops: the glock operations for the type of glock * @state: the state to acquire the glock in * @flags: modifier flags for the acquisition * @gh: the struct gfs2_holder * * Returns: errno */ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, unsigned int state, u16 flags, struct gfs2_holder *gh) { struct gfs2_glock *gl; int error; error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); if (!error) { error = gfs2_glock_nq_init(gl, state, flags, gh); gfs2_glock_put(gl); } return error; } /** * glock_compare - Compare two struct gfs2_glock structures for sorting * @arg_a: the first structure * @arg_b: the second structure * */ static int glock_compare(const void *arg_a, const void *arg_b) { const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; const struct lm_lockname *a = &gh_a->gh_gl->gl_name; const struct lm_lockname *b = &gh_b->gh_gl->gl_name; if (a->ln_number > b->ln_number) return 1; if (a->ln_number < b->ln_number) return -1; BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); return 0; } /** * nq_m_sync - synchronously acquire more than one glock in deadlock free order * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * @p: placeholder for the holder structure to pass back * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, struct gfs2_holder **p) { unsigned int x; int error = 0; for (x = 0; x < num_gh; x++) p[x] = &ghs[x]; sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); for (x = 0; x < num_gh; x++) { error = gfs2_glock_nq(p[x]); if (error) { while (x--) gfs2_glock_dq(p[x]); break; } } return error; } /** * gfs2_glock_nq_m - acquire multiple glocks * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) { struct gfs2_holder *tmp[4]; struct gfs2_holder **pph = tmp; int error = 0; switch(num_gh) { case 0: return 0; case 1: return gfs2_glock_nq(ghs); default: if (num_gh <= 4) break; pph = kmalloc_array(num_gh, sizeof(struct gfs2_holder *), GFP_NOFS); if (!pph) return -ENOMEM; } error = nq_m_sync(num_gh, ghs, pph); if (pph != tmp) kfree(pph); return error; } /** * gfs2_glock_dq_m - release multiple glocks * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * */ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) { while (num_gh--) gfs2_glock_dq(&ghs[num_gh]); } void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { unsigned long delay = 0; unsigned long holdtime; unsigned long now = jiffies; gfs2_glock_hold(gl); spin_lock(&gl->gl_lockref.lock); holdtime = gl->gl_tchange + gl->gl_hold_time; if (!list_empty(&gl->gl_holders) && gl->gl_name.ln_type == LM_TYPE_INODE) { if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) delay = gl->gl_hold_time; } request_demote(gl, state, delay, true); gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); } /** * gfs2_should_freeze - Figure out if glock should be frozen * @gl: The glock in question * * Glocks are not frozen if (a) the result of the dlm operation is * an error, (b) the locking operation was an unlock operation or * (c) if there is a "noexp" flagged request anywhere in the queue * * Returns: 1 if freezing should occur, 0 otherwise */ static int gfs2_should_freeze(const struct gfs2_glock *gl) { const struct gfs2_holder *gh; if (gl->gl_reply & ~LM_OUT_ST_MASK) return 0; if (gl->gl_target == LM_ST_UNLOCKED) return 0; list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (LM_FLAG_NOEXP & gh->gh_flags) return 0; } return 1; } /** * gfs2_glock_complete - Callback used by locking * @gl: Pointer to the glock * @ret: The return value from the dlm * * The gl_reply field is under the gl_lockref.lock lock so that it is ok * to use a bitfield shared with other glock state fields. */ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_lockref.lock); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); return; } } gl->gl_lockref.count++; set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } static int glock_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct gfs2_glock *gla, *glb; gla = list_entry(a, struct gfs2_glock, gl_lru); glb = list_entry(b, struct gfs2_glock, gl_lru); if (gla->gl_name.ln_number > glb->gl_name.ln_number) return 1; if (gla->gl_name.ln_number < glb->gl_name.ln_number) return -1; return 0; } static bool can_free_glock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; return !test_bit(GLF_LOCK, &gl->gl_flags) && !gl->gl_lockref.count && (!test_bit(GLF_LFLUSH, &gl->gl_flags) || test_bit(SDF_KILL, &sdp->sd_flags)); } /** * gfs2_dispose_glock_lru - Demote a list of glocks * @list: The list to dispose of * * Disposing of glocks may involve disk accesses, so that here we sort * the glocks by number (i.e. disk location of the inodes) so that if * there are any such accesses, they'll be sent in order (mostly). * * Must be called under the lru_lock, but may drop and retake this * lock. While the lru_lock is dropped, entries may vanish from the * list, but no new entries will appear on the list (since it is * private) */ static unsigned long gfs2_dispose_glock_lru(struct list_head *list) __releases(&lru_lock) __acquires(&lru_lock) { struct gfs2_glock *gl; unsigned long freed = 0; list_sort(NULL, list, glock_cmp); while(!list_empty(list)) { gl = list_first_entry(list, struct gfs2_glock, gl_lru); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_move(&gl->gl_lru, &lru_list); continue; } if (!can_free_glock(gl)) { spin_unlock(&gl->gl_lockref.lock); goto add_back_to_lru; } list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); freed++; gl->gl_lockref.count++; if (gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } return freed; } /** * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote * @nr: The number of entries to scan * * This function selects the entries on the LRU which are able to * be demoted, and then kicks off the process by calling * gfs2_dispose_glock_lru() above. */ static unsigned long gfs2_scan_glock_lru(unsigned long nr) { struct gfs2_glock *gl, *next; LIST_HEAD(dispose); unsigned long freed = 0; spin_lock(&lru_lock); list_for_each_entry_safe(gl, next, &lru_list, gl_lru) { if (!nr--) break; if (can_free_glock(gl)) list_move(&gl->gl_lru, &dispose); } if (!list_empty(&dispose)) freed = gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); return freed; } static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; return gfs2_scan_glock_lru(sc->nr_to_scan); } static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(atomic_read(&lru_count)); } static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket * @examiner: the function * @sdp: the filesystem * * Note that the function can be called multiple times on the same * object. So the user must ensure that the function can cope with * that. */ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; struct rhashtable_iter iter; rhashtable_walk_enter(&gl_hash_table, &iter); do { rhashtable_walk_start(&iter); while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) { if (gl->gl_name.ln_sbd == sdp) examiner(gl); } rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); } void gfs2_cancel_delete_work(struct gfs2_glock *gl) { clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags); if (cancel_delayed_work(&gl->gl_delete)) gfs2_glock_put(gl); } static void flush_delete_work(struct gfs2_glock *gl) { if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (cancel_delayed_work(&gl->gl_delete)) { queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } } } void gfs2_flush_delete_work(struct gfs2_sbd *sdp) { glock_hash_walk(flush_delete_work, sdp); flush_workqueue(sdp->sd_delete_wq); } /** * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw * */ static void thaw_glock(struct gfs2_glock *gl) { if (!test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags)) return; if (!lockref_get_not_dead(&gl->gl_lockref)) return; gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } /** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at * */ static void clear_glock(struct gfs2_glock *gl) { gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); if (!__lockref_is_dead(&gl->gl_lockref)) { gl->gl_lockref.count++; if (gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); gfs2_glock_queue_work(gl, 0); } spin_unlock(&gl->gl_lockref.lock); } /** * gfs2_glock_thaw - Thaw any frozen glocks * @sdp: The super block * */ void gfs2_glock_thaw(struct gfs2_sbd *sdp) { glock_hash_walk(thaw_glock, sdp); } static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) { spin_lock(&gl->gl_lockref.lock); gfs2_dump_glock(seq, gl, fsid); spin_unlock(&gl->gl_lockref.lock); } static void dump_glock_func(struct gfs2_glock *gl) { dump_glock(NULL, gl, true); } static void withdraw_dq(struct gfs2_glock *gl) { spin_lock(&gl->gl_lockref.lock); if (!__lockref_is_dead(&gl->gl_lockref) && glock_blocked_by_withdraw(gl)) do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ spin_unlock(&gl->gl_lockref.lock); } void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) { glock_hash_walk(withdraw_dq, sdp); } /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem * * Called when unmounting the filesystem. */ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long start = jiffies; bool timed_out = false; set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); flush_workqueue(sdp->sd_glock_wq); glock_hash_walk(clear_glock, sdp); flush_workqueue(sdp->sd_glock_wq); while (!timed_out) { wait_event_timeout(sdp->sd_kill_wait, !atomic_read(&sdp->sd_glock_disposal), HZ * 60); if (!atomic_read(&sdp->sd_glock_disposal)) break; timed_out = time_after(jiffies, start + (HZ * 600)); fs_warn(sdp, "%u glocks left after %u seconds%s\n", atomic_read(&sdp->sd_glock_disposal), jiffies_to_msecs(jiffies - start) / 1000, timed_out ? ":" : "; still waiting"); } gfs2_lm_unmount(sdp); gfs2_free_dead_glocks(sdp); glock_hash_walk(dump_glock_func, sdp); destroy_workqueue(sdp->sd_glock_wq); } static const char *state2str(unsigned state) { switch(state) { case LM_ST_UNLOCKED: return "UN"; case LM_ST_SHARED: return "SH"; case LM_ST_DEFERRED: return "DF"; case LM_ST_EXCLUSIVE: return "EX"; } return "??"; } static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) { char *p = buf; if (flags & LM_FLAG_TRY) *p++ = 't'; if (flags & LM_FLAG_TRY_1CB) *p++ = 'T'; if (flags & LM_FLAG_NOEXP) *p++ = 'e'; if (flags & LM_FLAG_ANY) *p++ = 'A'; if (flags & LM_FLAG_NODE_SCOPE) *p++ = 'n'; if (flags & GL_ASYNC) *p++ = 'a'; if (flags & GL_EXACT) *p++ = 'E'; if (flags & GL_NOCACHE) *p++ = 'c'; if (test_bit(HIF_HOLDER, &iflags)) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; if (flags & GL_SKIP) *p++ = 's'; *p = 0; return buf; } /** * dump_holder - print information about a glock holder * @seq: the seq_file struct * @gh: the glock holder * @fs_id_buf: pointer to file system id (if requested) * */ static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { const char *comm = "(none)"; pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); if (pid_is_meaningful(gh)) { struct task_struct *gh_owner; comm = "(ended)"; owner_pid = pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); if (gh_owner) comm = gh_owner->comm; } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) { const unsigned long *gflags = &gl->gl_flags; char *p = buf; if (test_bit(GLF_LOCK, gflags)) *p++ = 'l'; if (test_bit(GLF_DEMOTE, gflags)) *p++ = 'D'; if (test_bit(GLF_PENDING_DEMOTE, gflags)) *p++ = 'd'; if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) *p++ = 'p'; if (test_bit(GLF_DIRTY, gflags)) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) *p++ = 'i'; if (test_bit(GLF_HAVE_REPLY, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) *p++ = 'a'; if (test_bit(GLF_HAVE_FROZEN_REPLY, gflags)) *p++ = 'F'; if (!list_empty(&gl->gl_holders)) *p++ = 'q'; if (test_bit(GLF_LRU, gflags)) *p++ = 'L'; if (gl->gl_object) *p++ = 'o'; if (test_bit(GLF_BLOCKING, gflags)) *p++ = 'b'; if (test_bit(GLF_UNLOCKED, gflags)) *p++ = 'x'; if (test_bit(GLF_INSTANTIATE_NEEDED, gflags)) *p++ = 'n'; if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags)) *p++ = 'N'; if (test_bit(GLF_TRY_TO_EVICT, gflags)) *p++ = 'e'; if (test_bit(GLF_VERIFY_EVICT, gflags)) *p++ = 'E'; *p = 0; return buf; } /** * gfs2_dump_glock - print information about a glock * @seq: The seq_file struct * @gl: the glock * @fsid: If true, also dump the file system id * * The file format is as follows: * One line per object, capital letters are used to indicate objects * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, * other objects are indented by a single space and follow the glock to * which they are related. Fields are indicated by lower case letters * followed by a colon and the field value, except for strings which are in * [] so that its possible to see if they are composed of spaces for * example. The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. * */ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; unsigned long nrpages = 0; if (gl->gl_ops->go_flags & GLOF_ASPACE) { struct address_space *mapping = gfs2_glock2aspace(gl); nrpages = mapping->nrpages; } memset(fs_id_buf, 0, sizeof(fs_id_buf)); if (fsid && sdp) /* safety precaution */ sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d " "v:%d r:%d m:%ld p:%lu\n", fs_id_buf, state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, gflags2str(gflags_buf, gl), state2str(gl->gl_target), state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages); list_for_each_entry(gh, &gl->gl_holders, gh_list) dump_holder(seq, gh, fs_id_buf); if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) glops->go_dump(seq, gl, fs_id_buf); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock *gl = iter_ptr; seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); return 0; } static const char *gfs2_gltype[] = { "type", "reserved", "nondisk", "inode", "rgrp", "meta", "iopen", "flock", "plock", "quota", "journal", }; static const char *gfs2_stype[] = { [GFS2_LKS_SRTT] = "srtt", [GFS2_LKS_SRTTVAR] = "srttvar", [GFS2_LKS_SRTTB] = "srttb", [GFS2_LKS_SRTTVARB] = "srttvarb", [GFS2_LKS_SIRT] = "sirt", [GFS2_LKS_SIRTVAR] = "sirtvar", [GFS2_LKS_DCOUNT] = "dlm", [GFS2_LKS_QCOUNT] = "queue", }; #define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_sbd *sdp = seq->private; loff_t pos = *(loff_t *)iter_ptr; unsigned index = pos >> 3; unsigned subindex = pos & 0x07; int i; if (index == 0 && subindex != 0) return 0; seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], (index == 0) ? "cpu": gfs2_stype[subindex]); for_each_possible_cpu(i) { const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); if (index == 0) seq_printf(seq, " %15u", i); else seq_printf(seq, " %15llu", (unsigned long long)lkstats-> lkstats[index - 1].stats[subindex]); } seq_putc(seq, '\n'); return 0; } int __init gfs2_glock_init(void) { int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) return ret; glock_shrinker = shrinker_alloc(0, "gfs2-glock"); if (!glock_shrinker) { rhashtable_destroy(&gl_hash_table); return -ENOMEM; } glock_shrinker->count_objects = gfs2_glock_shrink_count; glock_shrinker->scan_objects = gfs2_glock_shrink_scan; shrinker_register(glock_shrinker); for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); return 0; } void gfs2_glock_exit(void) { shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); } static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { struct gfs2_glock *gl = gi->gl; if (gl) { if (n == 0) return; gfs2_glock_put_async(gl); } for (;;) { gl = rhashtable_walk_next(&gi->hti); if (IS_ERR_OR_NULL(gl)) { if (gl == ERR_PTR(-EAGAIN)) { n = 1; continue; } gl = NULL; break; } if (gl->gl_name.ln_sbd != gi->sdp) continue; if (n <= 1) { if (!lockref_get_not_dead(&gl->gl_lockref)) continue; break; } else { if (__lockref_is_dead(&gl->gl_lockref)) continue; n--; } } gi->gl = gl; } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n; /* * We can either stay where we are, skip to the next hash table * entry, or start from the beginning. */ if (*pos < gi->last_pos) { rhashtable_walk_exit(&gi->hti); rhashtable_walk_enter(&gl_hash_table, &gi->hti); n = *pos + 1; } else { n = *pos - gi->last_pos; } rhashtable_walk_start(&gi->hti); gfs2_glock_iter_next(gi, n); gi->last_pos = *pos; return gi->gl; } static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi, 1); return gi->gl; } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; rhashtable_walk_stop(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { dump_glock(seq, iter_ptr, false); return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) { preempt_disable(); if (*pos >= GFS2_NR_SBSTATS) return NULL; return pos; } static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { (*pos)++; if (*pos >= GFS2_NR_SBSTATS) return NULL; return pos; } static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) { preempt_enable(); } static const struct seq_operations gfs2_glock_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, .stop = gfs2_glock_seq_stop, .show = gfs2_glock_seq_show, }; static const struct seq_operations gfs2_glstats_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, .stop = gfs2_glock_seq_stop, .show = gfs2_glstats_seq_show, }; static const struct seq_operations gfs2_sbstats_sops = { .start = gfs2_sbstats_seq_start, .next = gfs2_sbstats_seq_next, .stop = gfs2_sbstats_seq_stop, .show = gfs2_sbstats_seq_show, }; #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) static int __gfs2_glocks_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; /* * Initially, we are "before" the first hash table entry; the * first call to rhashtable_walk_next gets us the first entry. */ gi->last_pos = -1; gi->gl = NULL; rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } static int gfs2_glocks_open(struct inode *inode, struct file *file) { return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); } static int gfs2_glocks_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; if (gi->gl) gfs2_glock_put(gi->gl); rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } static int gfs2_glstats_open(struct inode *inode, struct file *file) { return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } static const struct file_operations gfs2_glocks_fops = { .owner = THIS_MODULE, .open = gfs2_glocks_open, .read = seq_read, .llseek = seq_lseek, .release = gfs2_glocks_release, }; static const struct file_operations gfs2_glstats_fops = { .owner = THIS_MODULE, .open = gfs2_glstats_open, .read = seq_read, .llseek = seq_lseek, .release = gfs2_glocks_release, }; struct gfs2_glockfd_iter { struct super_block *sb; unsigned int tgid; struct task_struct *task; unsigned int fd; struct file *file; }; static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) { struct pid_namespace *ns = task_active_pid_ns(current); struct pid *pid; if (i->task) put_task_struct(i->task); rcu_read_lock(); retry: i->task = NULL; pid = find_ge_pid(i->tgid, ns); if (pid) { i->tgid = pid_nr_ns(pid, ns); i->task = pid_task(pid, PIDTYPE_TGID); if (!i->task) { i->tgid++; goto retry; } get_task_struct(i->task); } rcu_read_unlock(); return i->task; } static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) { if (i->file) { fput(i->file); i->file = NULL; } rcu_read_lock(); for(;; i->fd++) { struct inode *inode; i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } inode = file_inode(i->file); if (inode->i_sb == i->sb) break; rcu_read_unlock(); fput(i->file); rcu_read_lock(); } rcu_read_unlock(); return i->file; } static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glockfd_iter *i = seq->private; if (*pos) return NULL; while (gfs2_glockfd_next_task(i)) { if (gfs2_glockfd_next_file(i)) return i; i->tgid++; } return NULL; } static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct gfs2_glockfd_iter *i = seq->private; (*pos)++; i->fd++; do { if (gfs2_glockfd_next_file(i)) return i; i->tgid++; } while (gfs2_glockfd_next_task(i)); return NULL; } static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; if (i->file) fput(i->file); if (i->task) put_task_struct(i->task); } static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, struct gfs2_glockfd_iter *i) { struct gfs2_file *fp = i->file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; if (!READ_ONCE(fl_gh->gh_gl)) return; spin_lock(&i->file->f_lock); if (gfs2_holder_initialized(fl_gh)) gl_name = fl_gh->gh_gl->gl_name; spin_unlock(&i->file->f_lock); if (gl_name.ln_type != LM_TYPE_RESERVED) { seq_printf(seq, "%d %u %u/%llx\n", i->tgid, i->fd, gl_name.ln_type, (unsigned long long)gl_name.ln_number); } } static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; struct inode *inode = file_inode(i->file); struct gfs2_glock *gl; inode_lock_shared(inode); gl = GFS2_I(inode)->i_iopen_gh.gh_gl; if (gl) { seq_printf(seq, "%d %u %u/%llx\n", i->tgid, i->fd, gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); } gfs2_glockfd_seq_show_flock(seq, i); inode_unlock_shared(inode); return 0; } static const struct seq_operations gfs2_glockfd_seq_ops = { .start = gfs2_glockfd_seq_start, .next = gfs2_glockfd_seq_next, .stop = gfs2_glockfd_seq_stop, .show = gfs2_glockfd_seq_show, }; static int gfs2_glockfd_open(struct inode *inode, struct file *file) { struct gfs2_glockfd_iter *i; struct gfs2_sbd *sdp = inode->i_private; i = __seq_open_private(file, &gfs2_glockfd_seq_ops, sizeof(struct gfs2_glockfd_iter)); if (!i) return -ENOMEM; i->sb = sdp->sd_vfs; return 0; } static const struct file_operations gfs2_glockfd_fops = { .owner = THIS_MODULE, .open = gfs2_glockfd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glockfd_fops); debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_sbstats_fops); } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { debugfs_remove_recursive(sdp->debugfs_dir); sdp->debugfs_dir = NULL; } void gfs2_register_debugfs(void) { gfs2_root = debugfs_create_dir("gfs2", NULL); } void gfs2_unregister_debugfs(void) { debugfs_remove(gfs2_root); gfs2_root = NULL; }
5 5 4 5 5 5 4 5 5 169 171 42 4 4 43 38 23 1 22 23 22 247 243 1 2 1 1 166 167 73 1 1 166 23 23 10 10 6 10 7 1 5 7 59 17 2 2 2 2 2 2 1 3 14 17 14 13 13 11 1 5 4 7 86 2 1 3 87 60 60 306 73 73 71 3 73 4 3 3 1 1 5 73 73 74 233 245 172 66 172 11 11 6 172 168 167 64 179 63 62 62 63 62 62 63 5 5 5 5 5 5 5 295 5 285 290 285 278 258 279 295 283 283 53 75 75 166 166 166 166 165 166 166 166 166 166 166 166 73 73 73 73 166 166 166 164 165 166 165 166 164 165 166 166 166 166 165 166 165 166 3 166 166 164 166 166 22 22 166 166 166 166 166 166 166 10 166 166 165 166 166 22 166 166 166 166 166 65 22 65 2 65 65 3 65 166 166 166 166 11 11 10 11 1 3 3 1 1 1 1 1 1 1 14 14 14 14 14 9 9 9 9 8 9 9 9 2 2 2 2 2 2 74 74 72 74 3 74 71 44 74 76 76 76 10 76 76 76 76 73 72 73 6 73 2 73 3 9 9 7 9 8 35 34 7 35 35 156 156 154 154 154 154 154 154 154 154 154 156 162 162 156 162 10 10 6 5 6 1 6 10 10 10 10 10 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 // SPDX-License-Identifier: GPL-2.0-or-later /* * V4L2 controls framework core implementation. * * Copyright (C) 2010-2021 Hans Verkuil <hverkuil-cisco@xs4all.nl> */ #include <linux/export.h> #include <linux/mm.h> #include <linux/slab.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/v4l2-fwnode.h> #include "v4l2-ctrls-priv.h" static const union v4l2_ctrl_ptr ptr_null; static void fill_event(struct v4l2_event *ev, struct v4l2_ctrl *ctrl, u32 changes) { memset(ev, 0, sizeof(*ev)); ev->type = V4L2_EVENT_CTRL; ev->id = ctrl->id; ev->u.ctrl.changes = changes; ev->u.ctrl.type = ctrl->type; ev->u.ctrl.flags = user_flags(ctrl); if (ctrl->is_ptr) ev->u.ctrl.value64 = 0; else ev->u.ctrl.value64 = *ctrl->p_cur.p_s64; ev->u.ctrl.minimum = ctrl->minimum; ev->u.ctrl.maximum = ctrl->maximum; if (ctrl->type == V4L2_CTRL_TYPE_MENU || ctrl->type == V4L2_CTRL_TYPE_INTEGER_MENU) ev->u.ctrl.step = 1; else ev->u.ctrl.step = ctrl->step; ev->u.ctrl.default_value = ctrl->default_value; } void send_initial_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl) { struct v4l2_event ev; u32 changes = V4L2_EVENT_CTRL_CH_FLAGS; if (!(ctrl->flags & V4L2_CTRL_FLAG_WRITE_ONLY)) changes |= V4L2_EVENT_CTRL_CH_VALUE; fill_event(&ev, ctrl, changes); v4l2_event_queue_fh(fh, &ev); } void send_event(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 changes) { struct v4l2_event ev; struct v4l2_subscribed_event *sev; if (list_empty(&ctrl->ev_subs)) return; fill_event(&ev, ctrl, changes); list_for_each_entry(sev, &ctrl->ev_subs, node) if (sev->fh != fh || (sev->flags & V4L2_EVENT_SUB_FL_ALLOW_FEEDBACK)) v4l2_event_queue_fh(sev->fh, &ev); } bool v4l2_ctrl_type_op_equal(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr1, union v4l2_ctrl_ptr ptr2) { unsigned int i; switch (ctrl->type) { case V4L2_CTRL_TYPE_BUTTON: return false; case V4L2_CTRL_TYPE_STRING: for (i = 0; i < ctrl->elems; i++) { unsigned int idx = i * ctrl->elem_size; /* strings are always 0-terminated */ if (strcmp(ptr1.p_char + idx, ptr2.p_char + idx)) return false; } return true; default: return !memcmp(ptr1.p_const, ptr2.p_const, ctrl->elems * ctrl->elem_size); } } EXPORT_SYMBOL(v4l2_ctrl_type_op_equal); /* Default intra MPEG-2 quantisation coefficients, from the specification. */ static const u8 mpeg2_intra_quant_matrix[64] = { 8, 16, 16, 19, 16, 19, 22, 22, 22, 22, 22, 22, 26, 24, 26, 27, 27, 27, 26, 26, 26, 26, 27, 27, 27, 29, 29, 29, 34, 34, 34, 29, 29, 29, 27, 27, 29, 29, 32, 32, 34, 34, 37, 38, 37, 35, 35, 34, 35, 38, 38, 40, 40, 40, 48, 48, 46, 46, 56, 56, 58, 69, 69, 83 }; static void std_init_compound(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { struct v4l2_ctrl_mpeg2_sequence *p_mpeg2_sequence; struct v4l2_ctrl_mpeg2_picture *p_mpeg2_picture; struct v4l2_ctrl_mpeg2_quantisation *p_mpeg2_quant; struct v4l2_ctrl_vp8_frame *p_vp8_frame; struct v4l2_ctrl_vp9_frame *p_vp9_frame; struct v4l2_ctrl_fwht_params *p_fwht_params; struct v4l2_ctrl_h264_scaling_matrix *p_h264_scaling_matrix; struct v4l2_ctrl_av1_sequence *p_av1_sequence; void *p = ptr.p + idx * ctrl->elem_size; if (ctrl->p_def.p_const) memcpy(p, ctrl->p_def.p_const, ctrl->elem_size); else memset(p, 0, ctrl->elem_size); switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: p_mpeg2_sequence = p; /* 4:2:0 */ p_mpeg2_sequence->chroma_format = 1; break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: p_mpeg2_picture = p; /* interlaced top field */ p_mpeg2_picture->picture_structure = V4L2_MPEG2_PIC_TOP_FIELD; p_mpeg2_picture->picture_coding_type = V4L2_MPEG2_PIC_CODING_TYPE_I; break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: p_mpeg2_quant = p; memcpy(p_mpeg2_quant->intra_quantiser_matrix, mpeg2_intra_quant_matrix, ARRAY_SIZE(mpeg2_intra_quant_matrix)); /* * The default non-intra MPEG-2 quantisation * coefficients are all 16, as per the specification. */ memset(p_mpeg2_quant->non_intra_quantiser_matrix, 16, sizeof(p_mpeg2_quant->non_intra_quantiser_matrix)); break; case V4L2_CTRL_TYPE_VP8_FRAME: p_vp8_frame = p; p_vp8_frame->num_dct_parts = 1; break; case V4L2_CTRL_TYPE_VP9_FRAME: p_vp9_frame = p; p_vp9_frame->profile = 0; p_vp9_frame->bit_depth = 8; p_vp9_frame->flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING | V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING; break; case V4L2_CTRL_TYPE_AV1_SEQUENCE: p_av1_sequence = p; p_av1_sequence->bit_depth = 8; break; case V4L2_CTRL_TYPE_FWHT_PARAMS: p_fwht_params = p; p_fwht_params->version = V4L2_FWHT_VERSION; p_fwht_params->width = 1280; p_fwht_params->height = 720; p_fwht_params->flags = V4L2_FWHT_FL_PIXENC_YUV | (2 << V4L2_FWHT_FL_COMPONENTS_NUM_OFFSET); break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: p_h264_scaling_matrix = p; /* * The default (flat) H.264 scaling matrix when none are * specified in the bitstream, this is according to formulas * (7-8) and (7-9) of the specification. */ memset(p_h264_scaling_matrix, 16, sizeof(*p_h264_scaling_matrix)); break; } } void v4l2_ctrl_type_op_init(const struct v4l2_ctrl *ctrl, u32 from_idx, union v4l2_ctrl_ptr ptr) { unsigned int i; u32 tot_elems = ctrl->elems; u32 elems = tot_elems - from_idx; if (from_idx >= tot_elems) return; switch (ctrl->type) { case V4L2_CTRL_TYPE_STRING: for (i = from_idx; i < tot_elems; i++) { unsigned int offset = i * ctrl->elem_size; memset(ptr.p_char + offset, ' ', ctrl->minimum); ptr.p_char[offset + ctrl->minimum] = '\0'; } break; case V4L2_CTRL_TYPE_INTEGER64: if (ctrl->default_value) { for (i = from_idx; i < tot_elems; i++) ptr.p_s64[i] = ctrl->default_value; } else { memset(ptr.p_s64 + from_idx, 0, elems * sizeof(s64)); } break; case V4L2_CTRL_TYPE_INTEGER: case V4L2_CTRL_TYPE_INTEGER_MENU: case V4L2_CTRL_TYPE_MENU: case V4L2_CTRL_TYPE_BITMASK: case V4L2_CTRL_TYPE_BOOLEAN: if (ctrl->default_value) { for (i = from_idx; i < tot_elems; i++) ptr.p_s32[i] = ctrl->default_value; } else { memset(ptr.p_s32 + from_idx, 0, elems * sizeof(s32)); } break; case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: memset(ptr.p_s32 + from_idx, 0, elems * sizeof(s32)); break; case V4L2_CTRL_TYPE_U8: memset(ptr.p_u8 + from_idx, ctrl->default_value, elems); break; case V4L2_CTRL_TYPE_U16: if (ctrl->default_value) { for (i = from_idx; i < tot_elems; i++) ptr.p_u16[i] = ctrl->default_value; } else { memset(ptr.p_u16 + from_idx, 0, elems * sizeof(u16)); } break; case V4L2_CTRL_TYPE_U32: if (ctrl->default_value) { for (i = from_idx; i < tot_elems; i++) ptr.p_u32[i] = ctrl->default_value; } else { memset(ptr.p_u32 + from_idx, 0, elems * sizeof(u32)); } break; default: for (i = from_idx; i < tot_elems; i++) std_init_compound(ctrl, i, ptr); break; } } EXPORT_SYMBOL(v4l2_ctrl_type_op_init); void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl) { union v4l2_ctrl_ptr ptr = ctrl->p_cur; if (ctrl->is_array) { unsigned i; for (i = 0; i < ctrl->nr_of_dims; i++) pr_cont("[%u]", ctrl->dims[i]); pr_cont(" "); } switch (ctrl->type) { case V4L2_CTRL_TYPE_INTEGER: pr_cont("%d", *ptr.p_s32); break; case V4L2_CTRL_TYPE_BOOLEAN: pr_cont("%s", *ptr.p_s32 ? "true" : "false"); break; case V4L2_CTRL_TYPE_MENU: pr_cont("%s", ctrl->qmenu[*ptr.p_s32]); break; case V4L2_CTRL_TYPE_INTEGER_MENU: pr_cont("%lld", ctrl->qmenu_int[*ptr.p_s32]); break; case V4L2_CTRL_TYPE_BITMASK: pr_cont("0x%08x", *ptr.p_s32); break; case V4L2_CTRL_TYPE_INTEGER64: pr_cont("%lld", *ptr.p_s64); break; case V4L2_CTRL_TYPE_STRING: pr_cont("%s", ptr.p_char); break; case V4L2_CTRL_TYPE_U8: pr_cont("%u", (unsigned)*ptr.p_u8); break; case V4L2_CTRL_TYPE_U16: pr_cont("%u", (unsigned)*ptr.p_u16); break; case V4L2_CTRL_TYPE_U32: pr_cont("%u", (unsigned)*ptr.p_u32); break; case V4L2_CTRL_TYPE_AREA: pr_cont("%ux%u", ptr.p_area->width, ptr.p_area->height); break; case V4L2_CTRL_TYPE_H264_SPS: pr_cont("H264_SPS"); break; case V4L2_CTRL_TYPE_H264_PPS: pr_cont("H264_PPS"); break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: pr_cont("H264_SCALING_MATRIX"); break; case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: pr_cont("H264_SLICE_PARAMS"); break; case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: pr_cont("H264_DECODE_PARAMS"); break; case V4L2_CTRL_TYPE_H264_PRED_WEIGHTS: pr_cont("H264_PRED_WEIGHTS"); break; case V4L2_CTRL_TYPE_FWHT_PARAMS: pr_cont("FWHT_PARAMS"); break; case V4L2_CTRL_TYPE_VP8_FRAME: pr_cont("VP8_FRAME"); break; case V4L2_CTRL_TYPE_HDR10_CLL_INFO: pr_cont("HDR10_CLL_INFO"); break; case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY: pr_cont("HDR10_MASTERING_DISPLAY"); break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: pr_cont("MPEG2_QUANTISATION"); break; case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: pr_cont("MPEG2_SEQUENCE"); break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: pr_cont("MPEG2_PICTURE"); break; case V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR: pr_cont("VP9_COMPRESSED_HDR"); break; case V4L2_CTRL_TYPE_VP9_FRAME: pr_cont("VP9_FRAME"); break; case V4L2_CTRL_TYPE_HEVC_SPS: pr_cont("HEVC_SPS"); break; case V4L2_CTRL_TYPE_HEVC_PPS: pr_cont("HEVC_PPS"); break; case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: pr_cont("HEVC_SLICE_PARAMS"); break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: pr_cont("HEVC_SCALING_MATRIX"); break; case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: pr_cont("HEVC_DECODE_PARAMS"); break; case V4L2_CTRL_TYPE_AV1_SEQUENCE: pr_cont("AV1_SEQUENCE"); break; case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: pr_cont("AV1_TILE_GROUP_ENTRY"); break; case V4L2_CTRL_TYPE_AV1_FRAME: pr_cont("AV1_FRAME"); break; case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: pr_cont("AV1_FILM_GRAIN"); break; default: pr_cont("unknown type %d", ctrl->type); break; } } EXPORT_SYMBOL(v4l2_ctrl_type_op_log); /* * Round towards the closest legal value. Be careful when we are * close to the maximum range of the control type to prevent * wrap-arounds. */ #define ROUND_TO_RANGE(val, offset_type, ctrl) \ ({ \ offset_type offset; \ if ((ctrl)->maximum >= 0 && \ val >= (ctrl)->maximum - (s32)((ctrl)->step / 2)) \ val = (ctrl)->maximum; \ else \ val += (s32)((ctrl)->step / 2); \ val = clamp_t(typeof(val), val, \ (ctrl)->minimum, (ctrl)->maximum); \ offset = (val) - (ctrl)->minimum; \ offset = (ctrl)->step * (offset / (u32)(ctrl)->step); \ val = (ctrl)->minimum + offset; \ 0; \ }) /* Validate a new control */ #define zero_padding(s) \ memset(&(s).padding, 0, sizeof((s).padding)) #define zero_reserved(s) \ memset(&(s).reserved, 0, sizeof((s).reserved)) static int validate_vp9_lf_params(struct v4l2_vp9_loop_filter *lf) { unsigned int i; if (lf->flags & ~(V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED | V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE)) return -EINVAL; /* That all values are in the accepted range. */ if (lf->level > GENMASK(5, 0)) return -EINVAL; if (lf->sharpness > GENMASK(2, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->ref_deltas); i++) if (lf->ref_deltas[i] < -63 || lf->ref_deltas[i] > 63) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->mode_deltas); i++) if (lf->mode_deltas[i] < -63 || lf->mode_deltas[i] > 63) return -EINVAL; zero_reserved(*lf); return 0; } static int validate_vp9_quant_params(struct v4l2_vp9_quantization *quant) { if (quant->delta_q_y_dc < -15 || quant->delta_q_y_dc > 15 || quant->delta_q_uv_dc < -15 || quant->delta_q_uv_dc > 15 || quant->delta_q_uv_ac < -15 || quant->delta_q_uv_ac > 15) return -EINVAL; zero_reserved(*quant); return 0; } static int validate_vp9_seg_params(struct v4l2_vp9_segmentation *seg) { unsigned int i, j; if (seg->flags & ~(V4L2_VP9_SEGMENTATION_FLAG_ENABLED | V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP | V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE | V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA | V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(seg->feature_enabled); i++) { if (seg->feature_enabled[i] & ~V4L2_VP9_SEGMENT_FEATURE_ENABLED_MASK) return -EINVAL; } for (i = 0; i < ARRAY_SIZE(seg->feature_data); i++) { static const int range[] = { 255, 63, 3, 0 }; for (j = 0; j < ARRAY_SIZE(seg->feature_data[j]); j++) { if (seg->feature_data[i][j] < -range[j] || seg->feature_data[i][j] > range[j]) return -EINVAL; } } zero_reserved(*seg); return 0; } static int validate_vp9_compressed_hdr(struct v4l2_ctrl_vp9_compressed_hdr *hdr) { if (hdr->tx_mode > V4L2_VP9_TX_MODE_SELECT) return -EINVAL; return 0; } static int validate_vp9_frame(struct v4l2_ctrl_vp9_frame *frame) { int ret; /* Make sure we're not passed invalid flags. */ if (frame->flags & ~(V4L2_VP9_FRAME_FLAG_KEY_FRAME | V4L2_VP9_FRAME_FLAG_SHOW_FRAME | V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT | V4L2_VP9_FRAME_FLAG_INTRA_ONLY | V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV | V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX | V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE | V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING | V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING | V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING)) return -EINVAL; if (frame->flags & V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT && frame->flags & V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX) return -EINVAL; if (frame->profile > V4L2_VP9_PROFILE_MAX) return -EINVAL; if (frame->reset_frame_context > V4L2_VP9_RESET_FRAME_CTX_ALL) return -EINVAL; if (frame->frame_context_idx >= V4L2_VP9_NUM_FRAME_CTX) return -EINVAL; /* * Profiles 0 and 1 only support 8-bit depth, profiles 2 and 3 only 10 * and 12 bit depths. */ if ((frame->profile < 2 && frame->bit_depth != 8) || (frame->profile >= 2 && (frame->bit_depth != 10 && frame->bit_depth != 12))) return -EINVAL; /* Profile 0 and 2 only accept YUV 4:2:0. */ if ((frame->profile == 0 || frame->profile == 2) && (!(frame->flags & V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING) || !(frame->flags & V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING))) return -EINVAL; /* Profile 1 and 3 only accept YUV 4:2:2, 4:4:0 and 4:4:4. */ if ((frame->profile == 1 || frame->profile == 3) && ((frame->flags & V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING) && (frame->flags & V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING))) return -EINVAL; if (frame->interpolation_filter > V4L2_VP9_INTERP_FILTER_SWITCHABLE) return -EINVAL; /* * According to the spec, tile_cols_log2 shall be less than or equal * to 6. */ if (frame->tile_cols_log2 > 6) return -EINVAL; if (frame->reference_mode > V4L2_VP9_REFERENCE_MODE_SELECT) return -EINVAL; ret = validate_vp9_lf_params(&frame->lf); if (ret) return ret; ret = validate_vp9_quant_params(&frame->quant); if (ret) return ret; ret = validate_vp9_seg_params(&frame->seg); if (ret) return ret; zero_reserved(*frame); return 0; } static int validate_av1_quantization(struct v4l2_av1_quantization *q) { if (q->flags > GENMASK(2, 0)) return -EINVAL; if (q->delta_q_y_dc < -64 || q->delta_q_y_dc > 63 || q->delta_q_u_dc < -64 || q->delta_q_u_dc > 63 || q->delta_q_v_dc < -64 || q->delta_q_v_dc > 63 || q->delta_q_u_ac < -64 || q->delta_q_u_ac > 63 || q->delta_q_v_ac < -64 || q->delta_q_v_ac > 63 || q->delta_q_res > GENMASK(1, 0)) return -EINVAL; if (q->qm_y > GENMASK(3, 0) || q->qm_u > GENMASK(3, 0) || q->qm_v > GENMASK(3, 0)) return -EINVAL; return 0; } static int validate_av1_segmentation(struct v4l2_av1_segmentation *s) { u32 i; u32 j; if (s->flags > GENMASK(4, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(s->feature_data); i++) { static const int segmentation_feature_signed[] = { 1, 1, 1, 1, 1, 0, 0, 0 }; static const int segmentation_feature_max[] = { 255, 63, 63, 63, 63, 7, 0, 0}; for (j = 0; j < ARRAY_SIZE(s->feature_data[j]); j++) { s32 limit = segmentation_feature_max[j]; if (segmentation_feature_signed[j]) { if (s->feature_data[i][j] < -limit || s->feature_data[i][j] > limit) return -EINVAL; } else { if (s->feature_data[i][j] < 0 || s->feature_data[i][j] > limit) return -EINVAL; } } } return 0; } static int validate_av1_loop_filter(struct v4l2_av1_loop_filter *lf) { u32 i; if (lf->flags > GENMASK(3, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->level); i++) { if (lf->level[i] > GENMASK(5, 0)) return -EINVAL; } if (lf->sharpness > GENMASK(2, 0)) return -EINVAL; for (i = 0; i < ARRAY_SIZE(lf->ref_deltas); i++) { if (lf->ref_deltas[i] < -64 || lf->ref_deltas[i] > 63) return -EINVAL; } for (i = 0; i < ARRAY_SIZE(lf->mode_deltas); i++) { if (lf->mode_deltas[i] < -64 || lf->mode_deltas[i] > 63) return -EINVAL; } return 0; } static int validate_av1_cdef(struct v4l2_av1_cdef *cdef) { u32 i; if (cdef->damping_minus_3 > GENMASK(1, 0) || cdef->bits > GENMASK(1, 0)) return -EINVAL; for (i = 0; i < 1 << cdef->bits; i++) { if (cdef->y_pri_strength[i] > GENMASK(3, 0) || cdef->y_sec_strength[i] > 4 || cdef->uv_pri_strength[i] > GENMASK(3, 0) || cdef->uv_sec_strength[i] > 4) return -EINVAL; } return 0; } static int validate_av1_loop_restauration(struct v4l2_av1_loop_restoration *lr) { if (lr->lr_unit_shift > 3 || lr->lr_uv_shift > 1) return -EINVAL; return 0; } static int validate_av1_film_grain(struct v4l2_ctrl_av1_film_grain *fg) { u32 i; if (fg->flags > GENMASK(4, 0)) return -EINVAL; if (fg->film_grain_params_ref_idx > GENMASK(2, 0) || fg->num_y_points > 14 || fg->num_cb_points > 10 || fg->num_cr_points > GENMASK(3, 0) || fg->grain_scaling_minus_8 > GENMASK(1, 0) || fg->ar_coeff_lag > GENMASK(1, 0) || fg->ar_coeff_shift_minus_6 > GENMASK(1, 0) || fg->grain_scale_shift > GENMASK(1, 0)) return -EINVAL; if (!(fg->flags & V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN)) return 0; for (i = 1; i < fg->num_y_points; i++) if (fg->point_y_value[i] <= fg->point_y_value[i - 1]) return -EINVAL; for (i = 1; i < fg->num_cb_points; i++) if (fg->point_cb_value[i] <= fg->point_cb_value[i - 1]) return -EINVAL; for (i = 1; i < fg->num_cr_points; i++) if (fg->point_cr_value[i] <= fg->point_cr_value[i - 1]) return -EINVAL; return 0; } static int validate_av1_frame(struct v4l2_ctrl_av1_frame *f) { int ret = 0; ret = validate_av1_quantization(&f->quantization); if (ret) return ret; ret = validate_av1_segmentation(&f->segmentation); if (ret) return ret; ret = validate_av1_loop_filter(&f->loop_filter); if (ret) return ret; ret = validate_av1_cdef(&f->cdef); if (ret) return ret; ret = validate_av1_loop_restauration(&f->loop_restoration); if (ret) return ret; if (f->flags & ~(V4L2_AV1_FRAME_FLAG_SHOW_FRAME | V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME | V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE | V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE | V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS | V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV | V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC | V4L2_AV1_FRAME_FLAG_USE_SUPERRES | V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV | V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE | V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS | V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF | V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION | V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT | V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET | V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED | V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT | V4L2_AV1_FRAME_FLAG_FRAME_SIZE_OVERRIDE | V4L2_AV1_FRAME_FLAG_BUFFER_REMOVAL_TIME_PRESENT | V4L2_AV1_FRAME_FLAG_FRAME_REFS_SHORT_SIGNALING)) return -EINVAL; if (f->superres_denom > GENMASK(2, 0) + 9) return -EINVAL; return 0; } static int validate_av1_sequence(struct v4l2_ctrl_av1_sequence *s) { if (s->flags & ~(V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE | V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK | V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA | V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER | V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND | V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND | V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION | V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER | V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT | V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP | V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS | V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES | V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF | V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION | V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME | V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE | V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X | V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y | V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT | V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q)) return -EINVAL; if (s->seq_profile == 1 && s->flags & V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME) return -EINVAL; /* reserved */ if (s->seq_profile > 2) return -EINVAL; /* TODO: PROFILES */ return 0; } /* * Compound controls validation requires setting unused fields/flags to zero * in order to properly detect unchanged controls with v4l2_ctrl_type_op_equal's * memcmp. */ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { struct v4l2_ctrl_mpeg2_sequence *p_mpeg2_sequence; struct v4l2_ctrl_mpeg2_picture *p_mpeg2_picture; struct v4l2_ctrl_vp8_frame *p_vp8_frame; struct v4l2_ctrl_fwht_params *p_fwht_params; struct v4l2_ctrl_h264_sps *p_h264_sps; struct v4l2_ctrl_h264_pps *p_h264_pps; struct v4l2_ctrl_h264_pred_weights *p_h264_pred_weights; struct v4l2_ctrl_h264_slice_params *p_h264_slice_params; struct v4l2_ctrl_h264_decode_params *p_h264_dec_params; struct v4l2_ctrl_hevc_sps *p_hevc_sps; struct v4l2_ctrl_hevc_pps *p_hevc_pps; struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering; struct v4l2_ctrl_hevc_decode_params *p_hevc_decode_params; struct v4l2_area *area; void *p = ptr.p + idx * ctrl->elem_size; unsigned int i; switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: p_mpeg2_sequence = p; switch (p_mpeg2_sequence->chroma_format) { case 1: /* 4:2:0 */ case 2: /* 4:2:2 */ case 3: /* 4:4:4 */ break; default: return -EINVAL; } break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: p_mpeg2_picture = p; switch (p_mpeg2_picture->intra_dc_precision) { case 0: /* 8 bits */ case 1: /* 9 bits */ case 2: /* 10 bits */ case 3: /* 11 bits */ break; default: return -EINVAL; } switch (p_mpeg2_picture->picture_structure) { case V4L2_MPEG2_PIC_TOP_FIELD: case V4L2_MPEG2_PIC_BOTTOM_FIELD: case V4L2_MPEG2_PIC_FRAME: break; default: return -EINVAL; } switch (p_mpeg2_picture->picture_coding_type) { case V4L2_MPEG2_PIC_CODING_TYPE_I: case V4L2_MPEG2_PIC_CODING_TYPE_P: case V4L2_MPEG2_PIC_CODING_TYPE_B: break; default: return -EINVAL; } zero_reserved(*p_mpeg2_picture); break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: break; case V4L2_CTRL_TYPE_FWHT_PARAMS: p_fwht_params = p; if (p_fwht_params->version < V4L2_FWHT_VERSION) return -EINVAL; if (!p_fwht_params->width || !p_fwht_params->height) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_SPS: p_h264_sps = p; /* Some syntax elements are only conditionally valid */ if (p_h264_sps->pic_order_cnt_type != 0) { p_h264_sps->log2_max_pic_order_cnt_lsb_minus4 = 0; } else if (p_h264_sps->pic_order_cnt_type != 1) { p_h264_sps->num_ref_frames_in_pic_order_cnt_cycle = 0; p_h264_sps->offset_for_non_ref_pic = 0; p_h264_sps->offset_for_top_to_bottom_field = 0; memset(&p_h264_sps->offset_for_ref_frame, 0, sizeof(p_h264_sps->offset_for_ref_frame)); } if (!V4L2_H264_SPS_HAS_CHROMA_FORMAT(p_h264_sps)) { p_h264_sps->chroma_format_idc = 1; p_h264_sps->bit_depth_luma_minus8 = 0; p_h264_sps->bit_depth_chroma_minus8 = 0; p_h264_sps->flags &= ~V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; if (p_h264_sps->chroma_format_idc < 3) p_h264_sps->flags &= ~V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; } if (p_h264_sps->flags & V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY) p_h264_sps->flags &= ~V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; /* * Chroma 4:2:2 format require at least High 4:2:2 profile. * * The H264 specification and well-known parser implementations * use profile-idc values directly, as that is clearer and * less ambiguous. We do the same here. */ if (p_h264_sps->profile_idc < 122 && p_h264_sps->chroma_format_idc > 1) return -EINVAL; /* Chroma 4:4:4 format require at least High 4:2:2 profile */ if (p_h264_sps->profile_idc < 244 && p_h264_sps->chroma_format_idc > 2) return -EINVAL; if (p_h264_sps->chroma_format_idc > 3) return -EINVAL; if (p_h264_sps->bit_depth_luma_minus8 > 6) return -EINVAL; if (p_h264_sps->bit_depth_chroma_minus8 > 6) return -EINVAL; if (p_h264_sps->log2_max_frame_num_minus4 > 12) return -EINVAL; if (p_h264_sps->pic_order_cnt_type > 2) return -EINVAL; if (p_h264_sps->log2_max_pic_order_cnt_lsb_minus4 > 12) return -EINVAL; if (p_h264_sps->max_num_ref_frames > V4L2_H264_REF_LIST_LEN) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_PPS: p_h264_pps = p; if (p_h264_pps->num_slice_groups_minus1 > 7) return -EINVAL; if (p_h264_pps->num_ref_idx_l0_default_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; if (p_h264_pps->num_ref_idx_l1_default_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; if (p_h264_pps->weighted_bipred_idc > 2) return -EINVAL; /* * pic_init_qp_minus26 shall be in the range of * -(26 + QpBdOffset_y) to +25, inclusive, * where QpBdOffset_y is 6 * bit_depth_luma_minus8 */ if (p_h264_pps->pic_init_qp_minus26 < -62 || p_h264_pps->pic_init_qp_minus26 > 25) return -EINVAL; if (p_h264_pps->pic_init_qs_minus26 < -26 || p_h264_pps->pic_init_qs_minus26 > 25) return -EINVAL; if (p_h264_pps->chroma_qp_index_offset < -12 || p_h264_pps->chroma_qp_index_offset > 12) return -EINVAL; if (p_h264_pps->second_chroma_qp_index_offset < -12 || p_h264_pps->second_chroma_qp_index_offset > 12) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: break; case V4L2_CTRL_TYPE_H264_PRED_WEIGHTS: p_h264_pred_weights = p; if (p_h264_pred_weights->luma_log2_weight_denom > 7) return -EINVAL; if (p_h264_pred_weights->chroma_log2_weight_denom > 7) return -EINVAL; break; case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: p_h264_slice_params = p; if (p_h264_slice_params->slice_type != V4L2_H264_SLICE_TYPE_B) p_h264_slice_params->flags &= ~V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; if (p_h264_slice_params->colour_plane_id > 2) return -EINVAL; if (p_h264_slice_params->cabac_init_idc > 2) return -EINVAL; if (p_h264_slice_params->disable_deblocking_filter_idc > 2) return -EINVAL; if (p_h264_slice_params->slice_alpha_c0_offset_div2 < -6 || p_h264_slice_params->slice_alpha_c0_offset_div2 > 6) return -EINVAL; if (p_h264_slice_params->slice_beta_offset_div2 < -6 || p_h264_slice_params->slice_beta_offset_div2 > 6) return -EINVAL; if (p_h264_slice_params->slice_type == V4L2_H264_SLICE_TYPE_I || p_h264_slice_params->slice_type == V4L2_H264_SLICE_TYPE_SI) p_h264_slice_params->num_ref_idx_l0_active_minus1 = 0; if (p_h264_slice_params->slice_type != V4L2_H264_SLICE_TYPE_B) p_h264_slice_params->num_ref_idx_l1_active_minus1 = 0; if (p_h264_slice_params->num_ref_idx_l0_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; if (p_h264_slice_params->num_ref_idx_l1_active_minus1 > (V4L2_H264_REF_LIST_LEN - 1)) return -EINVAL; zero_reserved(*p_h264_slice_params); break; case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: p_h264_dec_params = p; if (p_h264_dec_params->nal_ref_idc > 3) return -EINVAL; for (i = 0; i < V4L2_H264_NUM_DPB_ENTRIES; i++) { struct v4l2_h264_dpb_entry *dpb_entry = &p_h264_dec_params->dpb[i]; zero_reserved(*dpb_entry); } zero_reserved(*p_h264_dec_params); break; case V4L2_CTRL_TYPE_VP8_FRAME: p_vp8_frame = p; switch (p_vp8_frame->num_dct_parts) { case 1: case 2: case 4: case 8: break; default: return -EINVAL; } zero_padding(p_vp8_frame->segment); zero_padding(p_vp8_frame->lf); zero_padding(p_vp8_frame->quant); zero_padding(p_vp8_frame->entropy); zero_padding(p_vp8_frame->coder_state); break; case V4L2_CTRL_TYPE_HEVC_SPS: p_hevc_sps = p; if (!(p_hevc_sps->flags & V4L2_HEVC_SPS_FLAG_PCM_ENABLED)) { p_hevc_sps->pcm_sample_bit_depth_luma_minus1 = 0; p_hevc_sps->pcm_sample_bit_depth_chroma_minus1 = 0; p_hevc_sps->log2_min_pcm_luma_coding_block_size_minus3 = 0; p_hevc_sps->log2_diff_max_min_pcm_luma_coding_block_size = 0; } if (!(p_hevc_sps->flags & V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT)) p_hevc_sps->num_long_term_ref_pics_sps = 0; break; case V4L2_CTRL_TYPE_HEVC_PPS: p_hevc_pps = p; if (!(p_hevc_pps->flags & V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED)) p_hevc_pps->diff_cu_qp_delta_depth = 0; if (!(p_hevc_pps->flags & V4L2_HEVC_PPS_FLAG_TILES_ENABLED)) { p_hevc_pps->num_tile_columns_minus1 = 0; p_hevc_pps->num_tile_rows_minus1 = 0; memset(&p_hevc_pps->column_width_minus1, 0, sizeof(p_hevc_pps->column_width_minus1)); memset(&p_hevc_pps->row_height_minus1, 0, sizeof(p_hevc_pps->row_height_minus1)); p_hevc_pps->flags &= ~V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; } if (p_hevc_pps->flags & V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER) { p_hevc_pps->pps_beta_offset_div2 = 0; p_hevc_pps->pps_tc_offset_div2 = 0; } break; case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: p_hevc_decode_params = p; if (p_hevc_decode_params->num_active_dpb_entries > V4L2_HEVC_DPB_ENTRIES_NUM_MAX) return -EINVAL; break; case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: break; case V4L2_CTRL_TYPE_HDR10_CLL_INFO: break; case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY: p_hdr10_mastering = p; for (i = 0; i < 3; ++i) { if (p_hdr10_mastering->display_primaries_x[i] < V4L2_HDR10_MASTERING_PRIMARIES_X_LOW || p_hdr10_mastering->display_primaries_x[i] > V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH || p_hdr10_mastering->display_primaries_y[i] < V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW || p_hdr10_mastering->display_primaries_y[i] > V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH) return -EINVAL; } if (p_hdr10_mastering->white_point_x < V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW || p_hdr10_mastering->white_point_x > V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH || p_hdr10_mastering->white_point_y < V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW || p_hdr10_mastering->white_point_y > V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH) return -EINVAL; if (p_hdr10_mastering->max_display_mastering_luminance < V4L2_HDR10_MASTERING_MAX_LUMA_LOW || p_hdr10_mastering->max_display_mastering_luminance > V4L2_HDR10_MASTERING_MAX_LUMA_HIGH || p_hdr10_mastering->min_display_mastering_luminance < V4L2_HDR10_MASTERING_MIN_LUMA_LOW || p_hdr10_mastering->min_display_mastering_luminance > V4L2_HDR10_MASTERING_MIN_LUMA_HIGH) return -EINVAL; /* The following restriction comes from ITU-T Rec. H.265 spec */ if (p_hdr10_mastering->max_display_mastering_luminance == V4L2_HDR10_MASTERING_MAX_LUMA_LOW && p_hdr10_mastering->min_display_mastering_luminance == V4L2_HDR10_MASTERING_MIN_LUMA_HIGH) return -EINVAL; break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: break; case V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR: return validate_vp9_compressed_hdr(p); case V4L2_CTRL_TYPE_VP9_FRAME: return validate_vp9_frame(p); case V4L2_CTRL_TYPE_AV1_FRAME: return validate_av1_frame(p); case V4L2_CTRL_TYPE_AV1_SEQUENCE: return validate_av1_sequence(p); case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: break; case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: return validate_av1_film_grain(p); case V4L2_CTRL_TYPE_AREA: area = p; if (!area->width || !area->height) return -EINVAL; break; default: return -EINVAL; } return 0; } static int std_validate_elem(const struct v4l2_ctrl *ctrl, u32 idx, union v4l2_ctrl_ptr ptr) { size_t len; u64 offset; s64 val; switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_INTEGER: return ROUND_TO_RANGE(ptr.p_s32[idx], u32, ctrl); case V4L2_CTRL_TYPE_INTEGER64: /* * We can't use the ROUND_TO_RANGE define here due to * the u64 divide that needs special care. */ val = ptr.p_s64[idx]; if (ctrl->maximum >= 0 && val >= ctrl->maximum - (s64)(ctrl->step / 2)) val = ctrl->maximum; else val += (s64)(ctrl->step / 2); val = clamp_t(s64, val, ctrl->minimum, ctrl->maximum); offset = val - ctrl->minimum; do_div(offset, ctrl->step); ptr.p_s64[idx] = ctrl->minimum + offset * ctrl->step; return 0; case V4L2_CTRL_TYPE_U8: return ROUND_TO_RANGE(ptr.p_u8[idx], u8, ctrl); case V4L2_CTRL_TYPE_U16: return ROUND_TO_RANGE(ptr.p_u16[idx], u16, ctrl); case V4L2_CTRL_TYPE_U32: return ROUND_TO_RANGE(ptr.p_u32[idx], u32, ctrl); case V4L2_CTRL_TYPE_BOOLEAN: ptr.p_s32[idx] = !!ptr.p_s32[idx]; return 0; case V4L2_CTRL_TYPE_MENU: case V4L2_CTRL_TYPE_INTEGER_MENU: if (ptr.p_s32[idx] < ctrl->minimum || ptr.p_s32[idx] > ctrl->maximum) return -ERANGE; if (ptr.p_s32[idx] < BITS_PER_LONG_LONG && (ctrl->menu_skip_mask & BIT_ULL(ptr.p_s32[idx]))) return -EINVAL; if (ctrl->type == V4L2_CTRL_TYPE_MENU && ctrl->qmenu[ptr.p_s32[idx]][0] == '\0') return -EINVAL; return 0; case V4L2_CTRL_TYPE_BITMASK: ptr.p_s32[idx] &= ctrl->maximum; return 0; case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: ptr.p_s32[idx] = 0; return 0; case V4L2_CTRL_TYPE_STRING: idx *= ctrl->elem_size; len = strlen(ptr.p_char + idx); if (len < ctrl->minimum) return -ERANGE; if ((len - (u32)ctrl->minimum) % (u32)ctrl->step) return -ERANGE; return 0; default: return std_validate_compound(ctrl, idx, ptr); } } int v4l2_ctrl_type_op_validate(const struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr ptr) { unsigned int i; int ret = 0; switch ((u32)ctrl->type) { case V4L2_CTRL_TYPE_U8: if (ctrl->maximum == 0xff && ctrl->minimum == 0 && ctrl->step == 1) return 0; break; case V4L2_CTRL_TYPE_U16: if (ctrl->maximum == 0xffff && ctrl->minimum == 0 && ctrl->step == 1) return 0; break; case V4L2_CTRL_TYPE_U32: if (ctrl->maximum == 0xffffffff && ctrl->minimum == 0 && ctrl->step == 1) return 0; break; case V4L2_CTRL_TYPE_BUTTON: case V4L2_CTRL_TYPE_CTRL_CLASS: memset(ptr.p_s32, 0, ctrl->new_elems * sizeof(s32)); return 0; } for (i = 0; !ret && i < ctrl->new_elems; i++) ret = std_validate_elem(ctrl, i, ptr); return ret; } EXPORT_SYMBOL(v4l2_ctrl_type_op_validate); static const struct v4l2_ctrl_type_ops std_type_ops = { .equal = v4l2_ctrl_type_op_equal, .init = v4l2_ctrl_type_op_init, .log = v4l2_ctrl_type_op_log, .validate = v4l2_ctrl_type_op_validate, }; void v4l2_ctrl_notify(struct v4l2_ctrl *ctrl, v4l2_ctrl_notify_fnc notify, void *priv) { if (!ctrl) return; if (!notify) { ctrl->call_notify = 0; return; } if (WARN_ON(ctrl->handler->notify && ctrl->handler->notify != notify)) return; ctrl->handler->notify = notify; ctrl->handler->notify_priv = priv; ctrl->call_notify = 1; } EXPORT_SYMBOL(v4l2_ctrl_notify); /* Copy the one value to another. */ static void ptr_to_ptr(struct v4l2_ctrl *ctrl, union v4l2_ctrl_ptr from, union v4l2_ctrl_ptr to, unsigned int elems) { if (ctrl == NULL) return; memcpy(to.p, from.p_const, elems * ctrl->elem_size); } /* Copy the new value to the current value. */ void new_to_cur(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl, u32 ch_flags) { bool changed; if (ctrl == NULL) return; /* has_changed is set by cluster_changed */ changed = ctrl->has_changed; if (changed) { if (ctrl->is_dyn_array) ctrl->elems = ctrl->new_elems; ptr_to_ptr(ctrl, ctrl->p_new, ctrl->p_cur, ctrl->elems); } if (ch_flags & V4L2_EVENT_CTRL_CH_FLAGS) { /* Note: CH_FLAGS is only set for auto clusters. */ ctrl->flags &= ~(V4L2_CTRL_FLAG_INACTIVE | V4L2_CTRL_FLAG_VOLATILE); if (!is_cur_manual(ctrl->cluster[0])) { ctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; if (ctrl->cluster[0]->has_volatiles) ctrl->flags |= V4L2_CTRL_FLAG_VOLATILE; } fh = NULL; } if (changed || ch_flags) { /* If a control was changed that was not one of the controls modified by the application, then send the event to all. */ if (!ctrl->is_new) fh = NULL; send_event(fh, ctrl, (changed ? V4L2_EVENT_CTRL_CH_VALUE : 0) | ch_flags); if (ctrl->call_notify && changed && ctrl->handler->notify) ctrl->handler->notify(ctrl, ctrl->handler->notify_priv); } } /* Copy the current value to the new value */ void cur_to_new(struct v4l2_ctrl *ctrl) { if (ctrl == NULL) return; if (ctrl->is_dyn_array) ctrl->new_elems = ctrl->elems; ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new, ctrl->new_elems); } static bool req_alloc_array(struct v4l2_ctrl_ref *ref, u32 elems) { void *tmp; if (elems == ref->p_req_array_alloc_elems) return true; if (ref->ctrl->is_dyn_array && elems < ref->p_req_array_alloc_elems) return true; tmp = kvmalloc(elems * ref->ctrl->elem_size, GFP_KERNEL); if (!tmp) { ref->p_req_array_enomem = true; return false; } ref->p_req_array_enomem = false; kvfree(ref->p_req.p); ref->p_req.p = tmp; ref->p_req_array_alloc_elems = elems; return true; } /* Copy the new value to the request value */ void new_to_req(struct v4l2_ctrl_ref *ref) { struct v4l2_ctrl *ctrl; if (!ref) return; ctrl = ref->ctrl; if (ctrl->is_array && !req_alloc_array(ref, ctrl->new_elems)) return; ref->p_req_elems = ctrl->new_elems; ptr_to_ptr(ctrl, ctrl->p_new, ref->p_req, ref->p_req_elems); ref->p_req_valid = true; } /* Copy the current value to the request value */ void cur_to_req(struct v4l2_ctrl_ref *ref) { struct v4l2_ctrl *ctrl; if (!ref) return; ctrl = ref->ctrl; if (ctrl->is_array && !req_alloc_array(ref, ctrl->elems)) return; ref->p_req_elems = ctrl->elems; ptr_to_ptr(ctrl, ctrl->p_cur, ref->p_req, ctrl->elems); ref->p_req_valid = true; } /* Copy the request value to the new value */ int req_to_new(struct v4l2_ctrl_ref *ref) { struct v4l2_ctrl *ctrl; if (!ref) return 0; ctrl = ref->ctrl; /* * This control was never set in the request, so just use the current * value. */ if (!ref->p_req_valid) { if (ctrl->is_dyn_array) ctrl->new_elems = ctrl->elems; ptr_to_ptr(ctrl, ctrl->p_cur, ctrl->p_new, ctrl->new_elems); return 0; } /* Not an array, so just copy the request value */ if (!ctrl->is_array) { ptr_to_ptr(ctrl, ref->p_req, ctrl->p_new, ctrl->new_elems); return 0; } /* Sanity check, should never happen */ if (WARN_ON(!ref->p_req_array_alloc_elems)) return -ENOMEM; if (!ctrl->is_dyn_array && ref->p_req_elems != ctrl->p_array_alloc_elems) return -ENOMEM; /* * Check if the number of elements in the request is more than the * elements in ctrl->p_array. If so, attempt to realloc ctrl->p_array. * Note that p_array is allocated with twice the number of elements * in the dynamic array since it has to store both the current and * new value of such a control. */ if (ref->p_req_elems > ctrl->p_array_alloc_elems) { unsigned int sz = ref->p_req_elems * ctrl->elem_size; void *old = ctrl->p_array; void *tmp = kvzalloc(2 * sz, GFP_KERNEL); if (!tmp) return -ENOMEM; memcpy(tmp, ctrl->p_new.p, ctrl->elems * ctrl->elem_size); memcpy(tmp + sz, ctrl->p_cur.p, ctrl->elems * ctrl->elem_size); ctrl->p_new.p = tmp; ctrl->p_cur.p = tmp + sz; ctrl->p_array = tmp; ctrl->p_array_alloc_elems = ref->p_req_elems; kvfree(old); } ctrl->new_elems = ref->p_req_elems; ptr_to_ptr(ctrl, ref->p_req, ctrl->p_new, ctrl->new_elems); return 0; } /* Control range checking */ int check_range(enum v4l2_ctrl_type type, s64 min, s64 max, u64 step, s64 def) { switch (type) { case V4L2_CTRL_TYPE_BOOLEAN: if (step != 1 || max > 1 || min < 0) return -ERANGE; fallthrough; case V4L2_CTRL_TYPE_U8: case V4L2_CTRL_TYPE_U16: case V4L2_CTRL_TYPE_U32: case V4L2_CTRL_TYPE_INTEGER: case V4L2_CTRL_TYPE_INTEGER64: if (step == 0 || min > max || def < min || def > max) return -ERANGE; return 0; case V4L2_CTRL_TYPE_BITMASK: if (step || min || !max || (def & ~max)) return -ERANGE; return 0; case V4L2_CTRL_TYPE_MENU: case V4L2_CTRL_TYPE_INTEGER_MENU: if (min > max || def < min || def > max || min < 0 || (step && max >= BITS_PER_LONG_LONG)) return -ERANGE; /* Note: step == menu_skip_mask for menu controls. So here we check if the default value is masked out. */ if (def < BITS_PER_LONG_LONG && (step & BIT_ULL(def))) return -EINVAL; return 0; case V4L2_CTRL_TYPE_STRING: if (min > max || min < 0 || step < 1 || def) return -ERANGE; return 0; default: return 0; } } /* Set the handler's error code if it wasn't set earlier already */ static inline int handler_set_err(struct v4l2_ctrl_handler *hdl, int err) { if (hdl->error == 0) hdl->error = err; return err; } /* Initialize the handler */ int v4l2_ctrl_handler_init_class(struct v4l2_ctrl_handler *hdl, unsigned nr_of_controls_hint, struct lock_class_key *key, const char *name) { mutex_init(&hdl->_lock); hdl->lock = &hdl->_lock; lockdep_set_class_and_name(hdl->lock, key, name); INIT_LIST_HEAD(&hdl->ctrls); INIT_LIST_HEAD(&hdl->ctrl_refs); hdl->nr_of_buckets = 1 + nr_of_controls_hint / 8; hdl->buckets = kvcalloc(hdl->nr_of_buckets, sizeof(hdl->buckets[0]), GFP_KERNEL); hdl->error = hdl->buckets ? 0 : -ENOMEM; v4l2_ctrl_handler_init_request(hdl); return hdl->error; } EXPORT_SYMBOL(v4l2_ctrl_handler_init_class); /* Free all controls and control refs */ void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl) { struct v4l2_ctrl_ref *ref, *next_ref; struct v4l2_ctrl *ctrl, *next_ctrl; struct v4l2_subscribed_event *sev, *next_sev; if (hdl == NULL || hdl->buckets == NULL) return; v4l2_ctrl_handler_free_request(hdl); mutex_lock(hdl->lock); /* Free all nodes */ list_for_each_entry_safe(ref, next_ref, &hdl->ctrl_refs, node) { list_del(&ref->node); if (ref->p_req_array_alloc_elems) kvfree(ref->p_req.p); kfree(ref); } /* Free all controls owned by the handler */ list_for_each_entry_safe(ctrl, next_ctrl, &hdl->ctrls, node) { list_del(&ctrl->node); list_for_each_entry_safe(sev, next_sev, &ctrl->ev_subs, node) list_del(&sev->node); kvfree(ctrl->p_array); kvfree(ctrl); } kvfree(hdl->buckets); hdl->buckets = NULL; hdl->cached = NULL; hdl->error = 0; mutex_unlock(hdl->lock); mutex_destroy(&hdl->_lock); } EXPORT_SYMBOL(v4l2_ctrl_handler_free); /* For backwards compatibility: V4L2_CID_PRIVATE_BASE should no longer be used except in G_CTRL, S_CTRL, QUERYCTRL and QUERYMENU when dealing with applications that do not use the NEXT_CTRL flag. We just find the n-th private user control. It's O(N), but that should not be an issue in this particular case. */ static struct v4l2_ctrl_ref *find_private_ref( struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref; id -= V4L2_CID_PRIVATE_BASE; list_for_each_entry(ref, &hdl->ctrl_refs, node) { /* Search for private user controls that are compatible with VIDIOC_G/S_CTRL. */ if (V4L2_CTRL_ID2WHICH(ref->ctrl->id) == V4L2_CTRL_CLASS_USER && V4L2_CTRL_DRIVER_PRIV(ref->ctrl->id)) { if (!ref->ctrl->is_int) continue; if (id == 0) return ref; id--; } } return NULL; } /* Find a control with the given ID. */ struct v4l2_ctrl_ref *find_ref(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref; int bucket; id &= V4L2_CTRL_ID_MASK; /* Old-style private controls need special handling */ if (id >= V4L2_CID_PRIVATE_BASE) return find_private_ref(hdl, id); bucket = id % hdl->nr_of_buckets; /* Simple optimization: cache the last control found */ if (hdl->cached && hdl->cached->ctrl->id == id) return hdl->cached; /* Not in cache, search the hash */ ref = hdl->buckets ? hdl->buckets[bucket] : NULL; while (ref && ref->ctrl->id != id) ref = ref->next; if (ref) hdl->cached = ref; /* cache it! */ return ref; } /* Find a control with the given ID. Take the handler's lock first. */ struct v4l2_ctrl_ref *find_ref_lock(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref = NULL; if (hdl) { mutex_lock(hdl->lock); ref = find_ref(hdl, id); mutex_unlock(hdl->lock); } return ref; } /* Find a control with the given ID. */ struct v4l2_ctrl *v4l2_ctrl_find(struct v4l2_ctrl_handler *hdl, u32 id) { struct v4l2_ctrl_ref *ref = find_ref_lock(hdl, id); return ref ? ref->ctrl : NULL; } EXPORT_SYMBOL(v4l2_ctrl_find); /* Allocate a new v4l2_ctrl_ref and hook it into the handler. */ int handler_new_ref(struct v4l2_ctrl_handler *hdl, struct v4l2_ctrl *ctrl, struct v4l2_ctrl_ref **ctrl_ref, bool from_other_dev, bool allocate_req) { struct v4l2_ctrl_ref *ref; struct v4l2_ctrl_ref *new_ref; u32 id = ctrl->id; u32 class_ctrl = V4L2_CTRL_ID2WHICH(id) | 1; int bucket = id % hdl->nr_of_buckets; /* which bucket to use */ unsigned int size_extra_req = 0; if (ctrl_ref) *ctrl_ref = NULL; /* * Automatically add the control class if it is not yet present and * the new control is not a compound control. */ if (ctrl->type < V4L2_CTRL_COMPOUND_TYPES && id != class_ctrl && find_ref_lock(hdl, class_ctrl) == NULL) if (!v4l2_ctrl_new_std(hdl, NULL, class_ctrl, 0, 0, 0, 0)) return hdl->error; if (hdl->error) return hdl->error; if (allocate_req && !ctrl->is_array) size_extra_req = ctrl->elems * ctrl->elem_size; new_ref = kzalloc(sizeof(*new_ref) + size_extra_req, GFP_KERNEL); if (!new_ref) return handler_set_err(hdl, -ENOMEM); new_ref->ctrl = ctrl; new_ref->from_other_dev = from_other_dev; if (size_extra_req) new_ref->p_req.p = &new_ref[1]; INIT_LIST_HEAD(&new_ref->node); mutex_lock(hdl->lock); /* Add immediately at the end of the list if the list is empty, or if the last element in the list has a lower ID. This ensures that when elements are added in ascending order the insertion is an O(1) operation. */ if (list_empty(&hdl->ctrl_refs) || id > node2id(hdl->ctrl_refs.prev)) { list_add_tail(&new_ref->node, &hdl->ctrl_refs); goto insert_in_hash; } /* Find insert position in sorted list */ list_for_each_entry(ref, &hdl->ctrl_refs, node) { if (ref->ctrl->id < id) continue; /* Don't add duplicates */ if (ref->ctrl->id == id) { kfree(new_ref); goto unlock; } list_add(&new_ref->node, ref->node.prev); break; } insert_in_hash: /* Insert the control node in the hash */ new_ref->next = hdl->buckets[bucket]; hdl->buckets[bucket] = new_ref; if (ctrl_ref) *ctrl_ref = new_ref; if (ctrl->handler == hdl) { /* By default each control starts in a cluster of its own. * new_ref->ctrl is basically a cluster array with one * element, so that's perfect to use as the cluster pointer. * But only do this for the handler that owns the control. */ ctrl->cluster = &new_ref->ctrl; ctrl->ncontrols = 1; } unlock: mutex_unlock(hdl->lock); return 0; } /* Add a new control */ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, const struct v4l2_ctrl_type_ops *type_ops, u32 id, const char *name, enum v4l2_ctrl_type type, s64 min, s64 max, u64 step, s64 def, const u32 dims[V4L2_CTRL_MAX_DIMS], u32 elem_size, u32 flags, const char * const *qmenu, const s64 *qmenu_int, const union v4l2_ctrl_ptr p_def, void *priv) { struct v4l2_ctrl *ctrl; unsigned sz_extra; unsigned nr_of_dims = 0; unsigned elems = 1; bool is_array; unsigned tot_ctrl_size; void *data; int err; if (hdl->error) return NULL; while (dims && dims[nr_of_dims]) { elems *= dims[nr_of_dims]; nr_of_dims++; if (nr_of_dims == V4L2_CTRL_MAX_DIMS) break; } is_array = nr_of_dims > 0; /* Prefill elem_size for all types handled by std_type_ops */ switch ((u32)type) { case V4L2_CTRL_TYPE_INTEGER64: elem_size = sizeof(s64); break; case V4L2_CTRL_TYPE_STRING: elem_size = max + 1; break; case V4L2_CTRL_TYPE_U8: elem_size = sizeof(u8); break; case V4L2_CTRL_TYPE_U16: elem_size = sizeof(u16); break; case V4L2_CTRL_TYPE_U32: elem_size = sizeof(u32); break; case V4L2_CTRL_TYPE_MPEG2_SEQUENCE: elem_size = sizeof(struct v4l2_ctrl_mpeg2_sequence); break; case V4L2_CTRL_TYPE_MPEG2_PICTURE: elem_size = sizeof(struct v4l2_ctrl_mpeg2_picture); break; case V4L2_CTRL_TYPE_MPEG2_QUANTISATION: elem_size = sizeof(struct v4l2_ctrl_mpeg2_quantisation); break; case V4L2_CTRL_TYPE_FWHT_PARAMS: elem_size = sizeof(struct v4l2_ctrl_fwht_params); break; case V4L2_CTRL_TYPE_H264_SPS: elem_size = sizeof(struct v4l2_ctrl_h264_sps); break; case V4L2_CTRL_TYPE_H264_PPS: elem_size = sizeof(struct v4l2_ctrl_h264_pps); break; case V4L2_CTRL_TYPE_H264_SCALING_MATRIX: elem_size = sizeof(struct v4l2_ctrl_h264_scaling_matrix); break; case V4L2_CTRL_TYPE_H264_SLICE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_h264_slice_params); break; case V4L2_CTRL_TYPE_H264_DECODE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_h264_decode_params); break; case V4L2_CTRL_TYPE_H264_PRED_WEIGHTS: elem_size = sizeof(struct v4l2_ctrl_h264_pred_weights); break; case V4L2_CTRL_TYPE_VP8_FRAME: elem_size = sizeof(struct v4l2_ctrl_vp8_frame); break; case V4L2_CTRL_TYPE_HEVC_SPS: elem_size = sizeof(struct v4l2_ctrl_hevc_sps); break; case V4L2_CTRL_TYPE_HEVC_PPS: elem_size = sizeof(struct v4l2_ctrl_hevc_pps); break; case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_hevc_slice_params); break; case V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX: elem_size = sizeof(struct v4l2_ctrl_hevc_scaling_matrix); break; case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: elem_size = sizeof(struct v4l2_ctrl_hevc_decode_params); break; case V4L2_CTRL_TYPE_HDR10_CLL_INFO: elem_size = sizeof(struct v4l2_ctrl_hdr10_cll_info); break; case V4L2_CTRL_TYPE_HDR10_MASTERING_DISPLAY: elem_size = sizeof(struct v4l2_ctrl_hdr10_mastering_display); break; case V4L2_CTRL_TYPE_VP9_COMPRESSED_HDR: elem_size = sizeof(struct v4l2_ctrl_vp9_compressed_hdr); break; case V4L2_CTRL_TYPE_VP9_FRAME: elem_size = sizeof(struct v4l2_ctrl_vp9_frame); break; case V4L2_CTRL_TYPE_AV1_SEQUENCE: elem_size = sizeof(struct v4l2_ctrl_av1_sequence); break; case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: elem_size = sizeof(struct v4l2_ctrl_av1_tile_group_entry); break; case V4L2_CTRL_TYPE_AV1_FRAME: elem_size = sizeof(struct v4l2_ctrl_av1_frame); break; case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: elem_size = sizeof(struct v4l2_ctrl_av1_film_grain); break; case V4L2_CTRL_TYPE_AREA: elem_size = sizeof(struct v4l2_area); break; default: if (type < V4L2_CTRL_COMPOUND_TYPES) elem_size = sizeof(s32); break; } /* Sanity checks */ if (id == 0 || name == NULL || !elem_size || id >= V4L2_CID_PRIVATE_BASE || (type == V4L2_CTRL_TYPE_MENU && qmenu == NULL) || (type == V4L2_CTRL_TYPE_INTEGER_MENU && qmenu_int == NULL)) { handler_set_err(hdl, -ERANGE); return NULL; } err = check_range(type, min, max, step, def); if (err) { handler_set_err(hdl, err); return NULL; } if (is_array && (type == V4L2_CTRL_TYPE_BUTTON || type == V4L2_CTRL_TYPE_CTRL_CLASS)) { handler_set_err(hdl, -EINVAL); return NULL; } if (flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) { /* * For now only support this for one-dimensional arrays only. * * This can be relaxed in the future, but this will * require more effort. */ if (nr_of_dims != 1) { handler_set_err(hdl, -EINVAL); return NULL; } /* Start with just 1 element */ elems = 1; } tot_ctrl_size = elem_size * elems; sz_extra = 0; if (type == V4L2_CTRL_TYPE_BUTTON) flags |= V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; else if (type == V4L2_CTRL_TYPE_CTRL_CLASS) flags |= V4L2_CTRL_FLAG_READ_ONLY; else if (!is_array && (type == V4L2_CTRL_TYPE_INTEGER64 || type == V4L2_CTRL_TYPE_STRING || type >= V4L2_CTRL_COMPOUND_TYPES)) sz_extra += 2 * tot_ctrl_size; if (type >= V4L2_CTRL_COMPOUND_TYPES && p_def.p_const) sz_extra += elem_size; ctrl = kvzalloc(sizeof(*ctrl) + sz_extra, GFP_KERNEL); if (ctrl == NULL) { handler_set_err(hdl, -ENOMEM); return NULL; } INIT_LIST_HEAD(&ctrl->node); INIT_LIST_HEAD(&ctrl->ev_subs); ctrl->handler = hdl; ctrl->ops = ops; ctrl->type_ops = type_ops ? type_ops : &std_type_ops; ctrl->id = id; ctrl->name = name; ctrl->type = type; ctrl->flags = flags; ctrl->minimum = min; ctrl->maximum = max; ctrl->step = step; ctrl->default_value = def; ctrl->is_string = !is_array && type == V4L2_CTRL_TYPE_STRING; ctrl->is_ptr = is_array || type >= V4L2_CTRL_COMPOUND_TYPES || ctrl->is_string; ctrl->is_int = !ctrl->is_ptr && type != V4L2_CTRL_TYPE_INTEGER64; ctrl->is_array = is_array; ctrl->is_dyn_array = !!(flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY); ctrl->elems = elems; ctrl->new_elems = elems; ctrl->nr_of_dims = nr_of_dims; if (nr_of_dims) memcpy(ctrl->dims, dims, nr_of_dims * sizeof(dims[0])); ctrl->elem_size = elem_size; if (type == V4L2_CTRL_TYPE_MENU) ctrl->qmenu = qmenu; else if (type == V4L2_CTRL_TYPE_INTEGER_MENU) ctrl->qmenu_int = qmenu_int; ctrl->priv = priv; ctrl->cur.val = ctrl->val = def; data = &ctrl[1]; if (ctrl->is_array) { ctrl->p_array_alloc_elems = elems; ctrl->p_array = kvzalloc(2 * elems * elem_size, GFP_KERNEL); if (!ctrl->p_array) { kvfree(ctrl); return NULL; } data = ctrl->p_array; } if (!ctrl->is_int) { ctrl->p_new.p = data; ctrl->p_cur.p = data + tot_ctrl_size; } else { ctrl->p_new.p = &ctrl->val; ctrl->p_cur.p = &ctrl->cur.val; } if (type >= V4L2_CTRL_COMPOUND_TYPES && p_def.p_const) { if (ctrl->is_array) ctrl->p_def.p = &ctrl[1]; else ctrl->p_def.p = ctrl->p_cur.p + tot_ctrl_size; memcpy(ctrl->p_def.p, p_def.p_const, elem_size); } ctrl->type_ops->init(ctrl, 0, ctrl->p_cur); cur_to_new(ctrl); if (handler_new_ref(hdl, ctrl, NULL, false, false)) { kvfree(ctrl->p_array); kvfree(ctrl); return NULL; } mutex_lock(hdl->lock); list_add_tail(&ctrl->node, &hdl->ctrls); mutex_unlock(hdl->lock); return ctrl; } struct v4l2_ctrl *v4l2_ctrl_new_custom(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_config *cfg, void *priv) { bool is_menu; struct v4l2_ctrl *ctrl; const char *name = cfg->name; const char * const *qmenu = cfg->qmenu; const s64 *qmenu_int = cfg->qmenu_int; enum v4l2_ctrl_type type = cfg->type; u32 flags = cfg->flags; s64 min = cfg->min; s64 max = cfg->max; u64 step = cfg->step; s64 def = cfg->def; if (name == NULL) v4l2_ctrl_fill(cfg->id, &name, &type, &min, &max, &step, &def, &flags); is_menu = (type == V4L2_CTRL_TYPE_MENU || type == V4L2_CTRL_TYPE_INTEGER_MENU); if (is_menu) WARN_ON(step); else WARN_ON(cfg->menu_skip_mask); if (type == V4L2_CTRL_TYPE_MENU && !qmenu) { qmenu = v4l2_ctrl_get_menu(cfg->id); } else if (type == V4L2_CTRL_TYPE_INTEGER_MENU && !qmenu_int) { handler_set_err(hdl, -EINVAL); return NULL; } ctrl = v4l2_ctrl_new(hdl, cfg->ops, cfg->type_ops, cfg->id, name, type, min, max, is_menu ? cfg->menu_skip_mask : step, def, cfg->dims, cfg->elem_size, flags, qmenu, qmenu_int, cfg->p_def, priv); if (ctrl) ctrl->is_private = cfg->is_private; return ctrl; } EXPORT_SYMBOL(v4l2_ctrl_new_custom); /* Helper function for standard non-menu controls */ struct v4l2_ctrl *v4l2_ctrl_new_std(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, s64 min, s64 max, u64 step, s64 def) { const char *name; enum v4l2_ctrl_type type; u32 flags; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type == V4L2_CTRL_TYPE_MENU || type == V4L2_CTRL_TYPE_INTEGER_MENU || type >= V4L2_CTRL_COMPOUND_TYPES) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, min, max, step, def, NULL, 0, flags, NULL, NULL, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std); /* Helper function for standard menu controls */ struct v4l2_ctrl *v4l2_ctrl_new_std_menu(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, u8 _max, u64 mask, u8 _def) { const char * const *qmenu = NULL; const s64 *qmenu_int = NULL; unsigned int qmenu_int_len = 0; const char *name; enum v4l2_ctrl_type type; s64 min; s64 max = _max; s64 def = _def; u64 step; u32 flags; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type == V4L2_CTRL_TYPE_MENU) qmenu = v4l2_ctrl_get_menu(id); else if (type == V4L2_CTRL_TYPE_INTEGER_MENU) qmenu_int = v4l2_ctrl_get_int_menu(id, &qmenu_int_len); if ((!qmenu && !qmenu_int) || (qmenu_int && max >= qmenu_int_len)) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, 0, max, mask, def, NULL, 0, flags, qmenu, qmenu_int, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std_menu); /* Helper function for standard menu controls with driver defined menu */ struct v4l2_ctrl *v4l2_ctrl_new_std_menu_items(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, u8 _max, u64 mask, u8 _def, const char * const *qmenu) { enum v4l2_ctrl_type type; const char *name; u32 flags; u64 step; s64 min; s64 max = _max; s64 def = _def; /* v4l2_ctrl_new_std_menu_items() should only be called for * standard controls without a standard menu. */ if (v4l2_ctrl_get_menu(id)) { handler_set_err(hdl, -EINVAL); return NULL; } v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type != V4L2_CTRL_TYPE_MENU || qmenu == NULL) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, 0, max, mask, def, NULL, 0, flags, qmenu, NULL, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std_menu_items); /* Helper function for standard compound controls */ struct v4l2_ctrl *v4l2_ctrl_new_std_compound(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, const union v4l2_ctrl_ptr p_def) { const char *name; enum v4l2_ctrl_type type; u32 flags; s64 min, max, step, def; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type < V4L2_CTRL_COMPOUND_TYPES) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, min, max, step, def, NULL, 0, flags, NULL, NULL, p_def, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_std_compound); /* Helper function for standard integer menu controls */ struct v4l2_ctrl *v4l2_ctrl_new_int_menu(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ops, u32 id, u8 _max, u8 _def, const s64 *qmenu_int) { const char *name; enum v4l2_ctrl_type type; s64 min; u64 step; s64 max = _max; s64 def = _def; u32 flags; v4l2_ctrl_fill(id, &name, &type, &min, &max, &step, &def, &flags); if (type != V4L2_CTRL_TYPE_INTEGER_MENU) { handler_set_err(hdl, -EINVAL); return NULL; } return v4l2_ctrl_new(hdl, ops, NULL, id, name, type, 0, max, 0, def, NULL, 0, flags, NULL, qmenu_int, ptr_null, NULL); } EXPORT_SYMBOL(v4l2_ctrl_new_int_menu); /* Add the controls from another handler to our own. */ int v4l2_ctrl_add_handler(struct v4l2_ctrl_handler *hdl, struct v4l2_ctrl_handler *add, bool (*filter)(const struct v4l2_ctrl *ctrl), bool from_other_dev) { struct v4l2_ctrl_ref *ref; int ret = 0; /* Do nothing if either handler is NULL or if they are the same */ if (!hdl || !add || hdl == add) return 0; if (hdl->error) return hdl->error; mutex_lock(add->lock); list_for_each_entry(ref, &add->ctrl_refs, node) { struct v4l2_ctrl *ctrl = ref->ctrl; /* Skip handler-private controls. */ if (ctrl->is_private) continue; /* And control classes */ if (ctrl->type == V4L2_CTRL_TYPE_CTRL_CLASS) continue; /* Filter any unwanted controls */ if (filter && !filter(ctrl)) continue; ret = handler_new_ref(hdl, ctrl, NULL, from_other_dev, false); if (ret) break; } mutex_unlock(add->lock); return ret; } EXPORT_SYMBOL(v4l2_ctrl_add_handler); bool v4l2_ctrl_radio_filter(const struct v4l2_ctrl *ctrl) { if (V4L2_CTRL_ID2WHICH(ctrl->id) == V4L2_CTRL_CLASS_FM_TX) return true; if (V4L2_CTRL_ID2WHICH(ctrl->id) == V4L2_CTRL_CLASS_FM_RX) return true; switch (ctrl->id) { case V4L2_CID_AUDIO_MUTE: case V4L2_CID_AUDIO_VOLUME: case V4L2_CID_AUDIO_BALANCE: case V4L2_CID_AUDIO_BASS: case V4L2_CID_AUDIO_TREBLE: case V4L2_CID_AUDIO_LOUDNESS: return true; default: break; } return false; } EXPORT_SYMBOL(v4l2_ctrl_radio_filter); /* Cluster controls */ void v4l2_ctrl_cluster(unsigned ncontrols, struct v4l2_ctrl **controls) { bool has_volatiles = false; int i; /* The first control is the master control and it must not be NULL */ if (WARN_ON(ncontrols == 0 || controls[0] == NULL)) return; for (i = 0; i < ncontrols; i++) { if (controls[i]) { controls[i]->cluster = controls; controls[i]->ncontrols = ncontrols; if (controls[i]->flags & V4L2_CTRL_FLAG_VOLATILE) has_volatiles = true; } } controls[0]->has_volatiles = has_volatiles; } EXPORT_SYMBOL(v4l2_ctrl_cluster); void v4l2_ctrl_auto_cluster(unsigned ncontrols, struct v4l2_ctrl **controls, u8 manual_val, bool set_volatile) { struct v4l2_ctrl *master = controls[0]; u32 flag = 0; int i; v4l2_ctrl_cluster(ncontrols, controls); WARN_ON(ncontrols <= 1); WARN_ON(manual_val < master->minimum || manual_val > master->maximum); WARN_ON(set_volatile && !has_op(master, g_volatile_ctrl)); master->is_auto = true; master->has_volatiles = set_volatile; master->manual_mode_value = manual_val; master->flags |= V4L2_CTRL_FLAG_UPDATE; if (!is_cur_manual(master)) flag = V4L2_CTRL_FLAG_INACTIVE | (set_volatile ? V4L2_CTRL_FLAG_VOLATILE : 0); for (i = 1; i < ncontrols; i++) if (controls[i]) controls[i]->flags |= flag; } EXPORT_SYMBOL(v4l2_ctrl_auto_cluster); /* * Obtain the current volatile values of an autocluster and mark them * as new. */ void update_from_auto_cluster(struct v4l2_ctrl *master) { int i; for (i = 1; i < master->ncontrols; i++) cur_to_new(master->cluster[i]); if (!call_op(master, g_volatile_ctrl)) for (i = 1; i < master->ncontrols; i++) if (master->cluster[i]) master->cluster[i]->is_new = 1; } /* * Return non-zero if one or more of the controls in the cluster has a new * value that differs from the current value. */ static int cluster_changed(struct v4l2_ctrl *master) { bool changed = false; int i; for (i = 0; i < master->ncontrols; i++) { struct v4l2_ctrl *ctrl = master->cluster[i]; bool ctrl_changed = false; if (!ctrl) continue; if (ctrl->flags & V4L2_CTRL_FLAG_EXECUTE_ON_WRITE) { changed = true; ctrl_changed = true; } /* * Set has_changed to false to avoid generating * the event V4L2_EVENT_CTRL_CH_VALUE */ if (ctrl->flags & V4L2_CTRL_FLAG_VOLATILE) { ctrl->has_changed = false; continue; } if (ctrl->elems != ctrl->new_elems) ctrl_changed = true; if (!ctrl_changed) ctrl_changed = !ctrl->type_ops->equal(ctrl, ctrl->p_cur, ctrl->p_new); ctrl->has_changed = ctrl_changed; changed |= ctrl->has_changed; } return changed; } /* * Core function that calls try/s_ctrl and ensures that the new value is * copied to the current value on a set. * Must be called with ctrl->handler->lock held. */ int try_or_set_cluster(struct v4l2_fh *fh, struct v4l2_ctrl *master, bool set, u32 ch_flags) { bool update_flag; int ret; int i; /* * Go through the cluster and either validate the new value or * (if no new value was set), copy the current value to the new * value, ensuring a consistent view for the control ops when * called. */ for (i = 0; i < master->ncontrols; i++) { struct v4l2_ctrl *ctrl = master->cluster[i]; if (!ctrl) continue; if (!ctrl->is_new) { cur_to_new(ctrl); continue; } /* * Check again: it may have changed since the * previous check in try_or_set_ext_ctrls(). */ if (set && (ctrl->flags & V4L2_CTRL_FLAG_GRABBED)) return -EBUSY; } ret = call_op(master, try_ctrl); /* Don't set if there is no change */ if (ret || !set || !cluster_changed(master)) return ret; ret = call_op(master, s_ctrl); if (ret) return ret; /* If OK, then make the new values permanent. */ update_flag = is_cur_manual(master) != is_new_manual(master); for (i = 0; i < master->ncontrols; i++) { /* * If we switch from auto to manual mode, and this cluster * contains volatile controls, then all non-master controls * have to be marked as changed. The 'new' value contains * the volatile value (obtained by update_from_auto_cluster), * which now has to become the current value. */ if (i && update_flag && is_new_manual(master) && master->has_volatiles && master->cluster[i]) master->cluster[i]->has_changed = true; new_to_cur(fh, master->cluster[i], ch_flags | ((update_flag && i > 0) ? V4L2_EVENT_CTRL_CH_FLAGS : 0)); } return 0; } /* Activate/deactivate a control. */ void v4l2_ctrl_activate(struct v4l2_ctrl *ctrl, bool active) { /* invert since the actual flag is called 'inactive' */ bool inactive = !active; bool old; if (ctrl == NULL) return; if (inactive) /* set V4L2_CTRL_FLAG_INACTIVE */ old = test_and_set_bit(4, &ctrl->flags); else /* clear V4L2_CTRL_FLAG_INACTIVE */ old = test_and_clear_bit(4, &ctrl->flags); if (old != inactive) send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_FLAGS); } EXPORT_SYMBOL(v4l2_ctrl_activate); void __v4l2_ctrl_grab(struct v4l2_ctrl *ctrl, bool grabbed) { bool old; if (ctrl == NULL) return; lockdep_assert_held(ctrl->handler->lock); if (grabbed) /* set V4L2_CTRL_FLAG_GRABBED */ old = test_and_set_bit(1, &ctrl->flags); else /* clear V4L2_CTRL_FLAG_GRABBED */ old = test_and_clear_bit(1, &ctrl->flags); if (old != grabbed) send_event(NULL, ctrl, V4L2_EVENT_CTRL_CH_FLAGS); } EXPORT_SYMBOL(__v4l2_ctrl_grab); /* Call s_ctrl for all controls owned by the handler */ int __v4l2_ctrl_handler_setup(struct v4l2_ctrl_handler *hdl) { struct v4l2_ctrl *ctrl; int ret = 0; if (hdl == NULL) return 0; lockdep_assert_held(hdl->lock); list_for_each_entry(ctrl, &hdl->ctrls, node) ctrl->done = false; list_for_each_entry(ctrl, &hdl->ctrls, node) { struct v4l2_ctrl *master = ctrl->cluster[0]; int i; /* Skip if this control was already handled by a cluster. */ /* Skip button controls and read-only controls. */ if (ctrl->done || ctrl->type == V4L2_CTRL_TYPE_BUTTON || (ctrl->flags & V4L2_CTRL_FLAG_READ_ONLY)) continue; for (i = 0; i < master->ncontrols; i++) { if (master->cluster[i]) { cur_to_new(master->cluster[i]); master->cluster[i]->is_new = 1; master->cluster[i]->done = true; } } ret = call_op(master, s_ctrl); if (ret) break; } return ret; } EXPORT_SYMBOL_GPL(__v4l2_ctrl_handler_setup); int v4l2_ctrl_handler_setup(struct v4l2_ctrl_handler *hdl) { int ret; if (hdl == NULL) return 0; mutex_lock(hdl->lock); ret = __v4l2_ctrl_handler_setup(hdl); mutex_unlock(hdl->lock); return ret; } EXPORT_SYMBOL(v4l2_ctrl_handler_setup); /* Log the control name and value */ static void log_ctrl(const struct v4l2_ctrl *ctrl, const char *prefix, const char *colon) { if (ctrl->flags & (V4L2_CTRL_FLAG_DISABLED | V4L2_CTRL_FLAG_WRITE_ONLY)) return; if (ctrl->type == V4L2_CTRL_TYPE_CTRL_CLASS) return; pr_info("%s%s%s: ", prefix, colon, ctrl->name); ctrl->type_ops->log(ctrl); if (ctrl->flags & (V4L2_CTRL_FLAG_INACTIVE | V4L2_CTRL_FLAG_GRABBED | V4L2_CTRL_FLAG_VOLATILE)) { if (ctrl->flags & V4L2_CTRL_FLAG_INACTIVE) pr_cont(" inactive"); if (ctrl->flags & V4L2_CTRL_FLAG_GRABBED) pr_cont(" grabbed"); if (ctrl->flags & V4L2_CTRL_FLAG_VOLATILE) pr_cont(" volatile"); } pr_cont("\n"); } /* Log all controls owned by the handler */ void v4l2_ctrl_handler_log_status(struct v4l2_ctrl_handler *hdl, const char *prefix) { struct v4l2_ctrl *ctrl; const char *colon = ""; int len; if (!hdl) return; if (!prefix) prefix = ""; len = strlen(prefix); if (len && prefix[len - 1] != ' ') colon = ": "; mutex_lock(hdl->lock); list_for_each_entry(ctrl, &hdl->ctrls, node) if (!(ctrl->flags & V4L2_CTRL_FLAG_DISABLED)) log_ctrl(ctrl, prefix, colon); mutex_unlock(hdl->lock); } EXPORT_SYMBOL(v4l2_ctrl_handler_log_status); int v4l2_ctrl_new_fwnode_properties(struct v4l2_ctrl_handler *hdl, const struct v4l2_ctrl_ops *ctrl_ops, const struct v4l2_fwnode_device_properties *p) { if (hdl->error) return hdl->error; if (p->orientation != V4L2_FWNODE_PROPERTY_UNSET) { u32 orientation_ctrl; switch (p->orientation) { case V4L2_FWNODE_ORIENTATION_FRONT: orientation_ctrl = V4L2_CAMERA_ORIENTATION_FRONT; break; case V4L2_FWNODE_ORIENTATION_BACK: orientation_ctrl = V4L2_CAMERA_ORIENTATION_BACK; break; case V4L2_FWNODE_ORIENTATION_EXTERNAL: orientation_ctrl = V4L2_CAMERA_ORIENTATION_EXTERNAL; break; default: return -EINVAL; } if (!v4l2_ctrl_new_std_menu(hdl, ctrl_ops, V4L2_CID_CAMERA_ORIENTATION, V4L2_CAMERA_ORIENTATION_EXTERNAL, 0, orientation_ctrl)) return hdl->error; } if (p->rotation != V4L2_FWNODE_PROPERTY_UNSET) { if (!v4l2_ctrl_new_std(hdl, ctrl_ops, V4L2_CID_CAMERA_SENSOR_ROTATION, p->rotation, p->rotation, 1, p->rotation)) return hdl->error; } return hdl->error; } EXPORT_SYMBOL(v4l2_ctrl_new_fwnode_properties);
23 23 23 23 20 23 21 21 8 8 8 8 13 4 1 9 2 1 2 13 5 4 4 4 4 4 4 4 2 4 4 4 37 28 37 28 28 20 20 16 16 26 25 24 23 23 23 23 23 23 8 20 20 20 20 20 4 4 4 4 4 9 8 9 9 3 9 1 8 1 7 6 6 6 6 5 4 4 4 4 4 36 36 36 36 35 34 34 33 32 33 32 26 9 33 36 36 36 6 6 6 6 6 4 1 3 3 3 3 2 2 2 4 19 17 17 17 17 17 17 17 2 17 17 17 15 17 2 3 17 17 15 17 16 16 16 16 4 4 4 17 17 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 4 4 4 4 3 3 3 3 2 2 2 2 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/pci.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/skbuff.h> #include "vmci_handle_array.h" #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_resource.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" #include "vmci_route.h" /* * In the following, we will distinguish between two kinds of VMX processes - * the ones with versions lower than VMCI_VERSION_NOVMVM that use specialized * VMCI page files in the VMX and supporting VM to VM communication and the * newer ones that use the guest memory directly. We will in the following * refer to the older VMX versions as old-style VMX'en, and the newer ones as * new-style VMX'en. * * The state transition datagram is as follows (the VMCIQPB_ prefix has been * removed for readability) - see below for more details on the transtions: * * -------------- NEW ------------- * | | * \_/ \_/ * CREATED_NO_MEM <-----------------> CREATED_MEM * | | | * | o-----------------------o | * | | | * \_/ \_/ \_/ * ATTACHED_NO_MEM <----------------> ATTACHED_MEM * | | | * | o----------------------o | * | | | * \_/ \_/ \_/ * SHUTDOWN_NO_MEM <----------------> SHUTDOWN_MEM * | | * | | * -------------> gone <------------- * * In more detail. When a VMCI queue pair is first created, it will be in the * VMCIQPB_NEW state. It will then move into one of the following states: * * - VMCIQPB_CREATED_NO_MEM: this state indicates that either: * * - the created was performed by a host endpoint, in which case there is * no backing memory yet. * * - the create was initiated by an old-style VMX, that uses * vmci_qp_broker_set_page_store to specify the UVAs of the queue pair at * a later point in time. This state can be distinguished from the one * above by the context ID of the creator. A host side is not allowed to * attach until the page store has been set. * * - VMCIQPB_CREATED_MEM: this state is the result when the queue pair * is created by a VMX using the queue pair device backend that * sets the UVAs of the queue pair immediately and stores the * information for later attachers. At this point, it is ready for * the host side to attach to it. * * Once the queue pair is in one of the created states (with the exception of * the case mentioned for older VMX'en above), it is possible to attach to the * queue pair. Again we have two new states possible: * * - VMCIQPB_ATTACHED_MEM: this state can be reached through the following * paths: * * - from VMCIQPB_CREATED_NO_MEM when a new-style VMX allocates a queue * pair, and attaches to a queue pair previously created by the host side. * * - from VMCIQPB_CREATED_MEM when the host side attaches to a queue pair * already created by a guest. * * - from VMCIQPB_ATTACHED_NO_MEM, when an old-style VMX calls * vmci_qp_broker_set_page_store (see below). * * - VMCIQPB_ATTACHED_NO_MEM: If the queue pair already was in the * VMCIQPB_CREATED_NO_MEM due to a host side create, an old-style VMX will * bring the queue pair into this state. Once vmci_qp_broker_set_page_store * is called to register the user memory, the VMCIQPB_ATTACH_MEM state * will be entered. * * From the attached queue pair, the queue pair can enter the shutdown states * when either side of the queue pair detaches. If the guest side detaches * first, the queue pair will enter the VMCIQPB_SHUTDOWN_NO_MEM state, where * the content of the queue pair will no longer be available. If the host * side detaches first, the queue pair will either enter the * VMCIQPB_SHUTDOWN_MEM, if the guest memory is currently mapped, or * VMCIQPB_SHUTDOWN_NO_MEM, if the guest memory is not mapped * (e.g., the host detaches while a guest is stunned). * * New-style VMX'en will also unmap guest memory, if the guest is * quiesced, e.g., during a snapshot operation. In that case, the guest * memory will no longer be available, and the queue pair will transition from * *_MEM state to a *_NO_MEM state. The VMX may later map the memory once more, * in which case the queue pair will transition from the *_NO_MEM state at that * point back to the *_MEM state. Note that the *_NO_MEM state may have changed, * since the peer may have either attached or detached in the meantime. The * values are laid out such that ++ on a state will move from a *_NO_MEM to a * *_MEM state, and vice versa. */ /* The Kernel specific component of the struct vmci_queue structure. */ struct vmci_queue_kern_if { struct mutex __mutex; /* Protects the queue. */ struct mutex *mutex; /* Shared by producer and consumer queues. */ size_t num_pages; /* Number of pages incl. header. */ bool host; /* Host or guest? */ union { struct { dma_addr_t *pas; void **vas; } g; /* Used by the guest. */ struct { struct page **page; struct page **header_page; } h; /* Used by the host. */ } u; }; /* * This structure is opaque to the clients. */ struct vmci_qp { struct vmci_handle handle; struct vmci_queue *produce_q; struct vmci_queue *consume_q; u64 produce_q_size; u64 consume_q_size; u32 peer; u32 flags; u32 priv_flags; bool guest_endpoint; unsigned int blocked; unsigned int generation; wait_queue_head_t event; }; enum qp_broker_state { VMCIQPB_NEW, VMCIQPB_CREATED_NO_MEM, VMCIQPB_CREATED_MEM, VMCIQPB_ATTACHED_NO_MEM, VMCIQPB_ATTACHED_MEM, VMCIQPB_SHUTDOWN_NO_MEM, VMCIQPB_SHUTDOWN_MEM, VMCIQPB_GONE }; #define QPBROKERSTATE_HAS_MEM(_qpb) (_qpb->state == VMCIQPB_CREATED_MEM || \ _qpb->state == VMCIQPB_ATTACHED_MEM || \ _qpb->state == VMCIQPB_SHUTDOWN_MEM) /* * In the queue pair broker, we always use the guest point of view for * the produce and consume queue values and references, e.g., the * produce queue size stored is the guests produce queue size. The * host endpoint will need to swap these around. The only exception is * the local queue pairs on the host, in which case the host endpoint * that creates the queue pair will have the right orientation, and * the attaching host endpoint will need to swap. */ struct qp_entry { struct list_head list_item; struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u32 ref_count; }; struct qp_broker_entry { struct vmci_resource resource; struct qp_entry qp; u32 create_id; u32 attach_id; enum qp_broker_state state; bool require_trusted_attach; bool created_by_trusted; bool vmci_page_files; /* Created by VMX using VMCI page files */ struct vmci_queue *produce_q; struct vmci_queue *consume_q; struct vmci_queue_header saved_produce_q; struct vmci_queue_header saved_consume_q; vmci_event_release_cb wakeup_cb; void *client_data; void *local_mem; /* Kernel memory for local queue pair */ }; struct qp_guest_endpoint { struct vmci_resource resource; struct qp_entry qp; u64 num_ppns; void *produce_q; void *consume_q; struct ppn_set ppn_set; }; struct qp_list { struct list_head head; struct mutex mutex; /* Protect queue list. */ }; static struct qp_list qp_broker_list = { .head = LIST_HEAD_INIT(qp_broker_list.head), .mutex = __MUTEX_INITIALIZER(qp_broker_list.mutex), }; static struct qp_list qp_guest_endpoints = { .head = LIST_HEAD_INIT(qp_guest_endpoints.head), .mutex = __MUTEX_INITIALIZER(qp_guest_endpoints.mutex), }; #define INVALID_VMCI_GUEST_MEM_ID 0 #define QPE_NUM_PAGES(_QPE) ((u32) \ (DIV_ROUND_UP(_QPE.produce_size, PAGE_SIZE) + \ DIV_ROUND_UP(_QPE.consume_size, PAGE_SIZE) + 2)) #define QP_SIZES_ARE_VALID(_prod_qsize, _cons_qsize) \ ((_prod_qsize) + (_cons_qsize) >= max(_prod_qsize, _cons_qsize) && \ (_prod_qsize) + (_cons_qsize) <= VMCI_MAX_GUEST_QP_MEMORY) /* * Frees kernel VA space for a given queue and its queue header, and * frees physical data pages. */ static void qp_free_queue(void *q, u64 size) { struct vmci_queue *queue = q; if (queue) { u64 i; /* Given size does not include header, so add in a page here. */ for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE) + 1; i++) { dma_free_coherent(&vmci_pdev->dev, PAGE_SIZE, queue->kernel_if->u.g.vas[i], queue->kernel_if->u.g.pas[i]); } vfree(queue); } } /* * Allocates kernel queue pages of specified size with IOMMU mappings, * plus space for the queue structure/kernel interface and the queue * header. */ static void *qp_alloc_queue(u64 size, u32 flags) { u64 i; struct vmci_queue *queue; size_t pas_size; size_t vas_size; size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if); u64 num_pages; if (size > SIZE_MAX - PAGE_SIZE) return NULL; num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / (sizeof(*queue->kernel_if->u.g.pas) + sizeof(*queue->kernel_if->u.g.vas))) return NULL; pas_size = num_pages * sizeof(*queue->kernel_if->u.g.pas); vas_size = num_pages * sizeof(*queue->kernel_if->u.g.vas); queue_size += pas_size + vas_size; queue = vmalloc(queue_size); if (!queue) return NULL; queue->q_header = NULL; queue->saved_header = NULL; queue->kernel_if = (struct vmci_queue_kern_if *)(queue + 1); queue->kernel_if->mutex = NULL; queue->kernel_if->num_pages = num_pages; queue->kernel_if->u.g.pas = (dma_addr_t *)(queue->kernel_if + 1); queue->kernel_if->u.g.vas = (void **)((u8 *)queue->kernel_if->u.g.pas + pas_size); queue->kernel_if->host = false; for (i = 0; i < num_pages; i++) { queue->kernel_if->u.g.vas[i] = dma_alloc_coherent(&vmci_pdev->dev, PAGE_SIZE, &queue->kernel_if->u.g.pas[i], GFP_KERNEL); if (!queue->kernel_if->u.g.vas[i]) { /* Size excl. the header. */ qp_free_queue(queue, i * PAGE_SIZE); return NULL; } } /* Queue header is the first page. */ queue->q_header = queue->kernel_if->u.g.vas[0]; return queue; } /* * Copies from a given buffer or iovector to a VMCI Queue. Uses * kmap_local_page() to dynamically map required portions of the queue * by traversing the offset -> page translation structure for the queue. * Assumes that offset + size does not wrap around in the queue. */ static int qp_memcpy_to_queue_iter(struct vmci_queue *queue, u64 queue_offset, struct iov_iter *from, size_t size) { struct vmci_queue_kern_if *kernel_if = queue->kernel_if; size_t bytes_copied = 0; while (bytes_copied < size) { const u64 page_index = (queue_offset + bytes_copied) / PAGE_SIZE; const size_t page_offset = (queue_offset + bytes_copied) & (PAGE_SIZE - 1); void *va; size_t to_copy; if (kernel_if->host) va = kmap_local_page(kernel_if->u.h.page[page_index]); else va = kernel_if->u.g.vas[page_index + 1]; /* Skip header. */ if (size - bytes_copied > PAGE_SIZE - page_offset) /* Enough payload to fill up from this page. */ to_copy = PAGE_SIZE - page_offset; else to_copy = size - bytes_copied; if (!copy_from_iter_full((u8 *)va + page_offset, to_copy, from)) { if (kernel_if->host) kunmap_local(va); return VMCI_ERROR_INVALID_ARGS; } bytes_copied += to_copy; if (kernel_if->host) kunmap_local(va); } return VMCI_SUCCESS; } /* * Copies to a given buffer or iovector from a VMCI Queue. Uses * kmap_local_page() to dynamically map required portions of the queue * by traversing the offset -> page translation structure for the queue. * Assumes that offset + size does not wrap around in the queue. */ static int qp_memcpy_from_queue_iter(struct iov_iter *to, const struct vmci_queue *queue, u64 queue_offset, size_t size) { struct vmci_queue_kern_if *kernel_if = queue->kernel_if; size_t bytes_copied = 0; while (bytes_copied < size) { const u64 page_index = (queue_offset + bytes_copied) / PAGE_SIZE; const size_t page_offset = (queue_offset + bytes_copied) & (PAGE_SIZE - 1); void *va; size_t to_copy; int err; if (kernel_if->host) va = kmap_local_page(kernel_if->u.h.page[page_index]); else va = kernel_if->u.g.vas[page_index + 1]; /* Skip header. */ if (size - bytes_copied > PAGE_SIZE - page_offset) /* Enough payload to fill up this page. */ to_copy = PAGE_SIZE - page_offset; else to_copy = size - bytes_copied; err = copy_to_iter((u8 *)va + page_offset, to_copy, to); if (err != to_copy) { if (kernel_if->host) kunmap_local(va); return VMCI_ERROR_INVALID_ARGS; } bytes_copied += to_copy; if (kernel_if->host) kunmap_local(va); } return VMCI_SUCCESS; } /* * Allocates two list of PPNs --- one for the pages in the produce queue, * and the other for the pages in the consume queue. Intializes the list * of PPNs with the page frame numbers of the KVA for the two queues (and * the queue headers). */ static int qp_alloc_ppn_set(void *prod_q, u64 num_produce_pages, void *cons_q, u64 num_consume_pages, struct ppn_set *ppn_set) { u64 *produce_ppns; u64 *consume_ppns; struct vmci_queue *produce_q = prod_q; struct vmci_queue *consume_q = cons_q; u64 i; if (!produce_q || !num_produce_pages || !consume_q || !num_consume_pages || !ppn_set) return VMCI_ERROR_INVALID_ARGS; if (ppn_set->initialized) return VMCI_ERROR_ALREADY_EXISTS; produce_ppns = kmalloc_array(num_produce_pages, sizeof(*produce_ppns), GFP_KERNEL); if (!produce_ppns) return VMCI_ERROR_NO_MEM; consume_ppns = kmalloc_array(num_consume_pages, sizeof(*consume_ppns), GFP_KERNEL); if (!consume_ppns) { kfree(produce_ppns); return VMCI_ERROR_NO_MEM; } for (i = 0; i < num_produce_pages; i++) produce_ppns[i] = produce_q->kernel_if->u.g.pas[i] >> PAGE_SHIFT; for (i = 0; i < num_consume_pages; i++) consume_ppns[i] = consume_q->kernel_if->u.g.pas[i] >> PAGE_SHIFT; ppn_set->num_produce_pages = num_produce_pages; ppn_set->num_consume_pages = num_consume_pages; ppn_set->produce_ppns = produce_ppns; ppn_set->consume_ppns = consume_ppns; ppn_set->initialized = true; return VMCI_SUCCESS; } /* * Frees the two list of PPNs for a queue pair. */ static void qp_free_ppn_set(struct ppn_set *ppn_set) { if (ppn_set->initialized) { /* Do not call these functions on NULL inputs. */ kfree(ppn_set->produce_ppns); kfree(ppn_set->consume_ppns); } memset(ppn_set, 0, sizeof(*ppn_set)); } /* * Populates the list of PPNs in the hypercall structure with the PPNS * of the produce queue and the consume queue. */ static int qp_populate_ppn_set(u8 *call_buf, const struct ppn_set *ppn_set) { if (vmci_use_ppn64()) { memcpy(call_buf, ppn_set->produce_ppns, ppn_set->num_produce_pages * sizeof(*ppn_set->produce_ppns)); memcpy(call_buf + ppn_set->num_produce_pages * sizeof(*ppn_set->produce_ppns), ppn_set->consume_ppns, ppn_set->num_consume_pages * sizeof(*ppn_set->consume_ppns)); } else { int i; u32 *ppns = (u32 *) call_buf; for (i = 0; i < ppn_set->num_produce_pages; i++) ppns[i] = (u32) ppn_set->produce_ppns[i]; ppns = &ppns[ppn_set->num_produce_pages]; for (i = 0; i < ppn_set->num_consume_pages; i++) ppns[i] = (u32) ppn_set->consume_ppns[i]; } return VMCI_SUCCESS; } /* * Allocates kernel VA space of specified size plus space for the queue * and kernel interface. This is different from the guest queue allocator, * because we do not allocate our own queue header/data pages here but * share those of the guest. */ static struct vmci_queue *qp_host_alloc_queue(u64 size) { struct vmci_queue *queue; size_t queue_page_size; u64 num_pages; const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if)); if (size > min_t(size_t, VMCI_MAX_GUEST_QP_MEMORY, SIZE_MAX - PAGE_SIZE)) return NULL; num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / sizeof(*queue->kernel_if->u.h.page)) return NULL; queue_page_size = num_pages * sizeof(*queue->kernel_if->u.h.page); if (queue_size + queue_page_size > KMALLOC_MAX_SIZE) return NULL; queue = kzalloc(queue_size + queue_page_size, GFP_KERNEL); if (queue) { queue->q_header = NULL; queue->saved_header = NULL; queue->kernel_if = (struct vmci_queue_kern_if *)(queue + 1); queue->kernel_if->host = true; queue->kernel_if->mutex = NULL; queue->kernel_if->num_pages = num_pages; queue->kernel_if->u.h.header_page = (struct page **)((u8 *)queue + queue_size); queue->kernel_if->u.h.page = &queue->kernel_if->u.h.header_page[1]; } return queue; } /* * Frees kernel memory for a given queue (header plus translation * structure). */ static void qp_host_free_queue(struct vmci_queue *queue, u64 queue_size) { kfree(queue); } /* * Initialize the mutex for the pair of queues. This mutex is used to * protect the q_header and the buffer from changing out from under any * users of either queue. Of course, it's only any good if the mutexes * are actually acquired. Queue structure must lie on non-paged memory * or we cannot guarantee access to the mutex. */ static void qp_init_queue_mutex(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { /* * Only the host queue has shared state - the guest queues do not * need to synchronize access using a queue mutex. */ if (produce_q->kernel_if->host) { produce_q->kernel_if->mutex = &produce_q->kernel_if->__mutex; consume_q->kernel_if->mutex = &produce_q->kernel_if->__mutex; mutex_init(produce_q->kernel_if->mutex); } } /* * Cleans up the mutex for the pair of queues. */ static void qp_cleanup_queue_mutex(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { if (produce_q->kernel_if->host) { produce_q->kernel_if->mutex = NULL; consume_q->kernel_if->mutex = NULL; } } /* * Acquire the mutex for the queue. Note that the produce_q and * the consume_q share a mutex. So, only one of the two need to * be passed in to this routine. Either will work just fine. */ static void qp_acquire_queue_mutex(struct vmci_queue *queue) { if (queue->kernel_if->host) mutex_lock(queue->kernel_if->mutex); } /* * Release the mutex for the queue. Note that the produce_q and * the consume_q share a mutex. So, only one of the two need to * be passed in to this routine. Either will work just fine. */ static void qp_release_queue_mutex(struct vmci_queue *queue) { if (queue->kernel_if->host) mutex_unlock(queue->kernel_if->mutex); } /* * Helper function to release pages in the PageStoreAttachInfo * previously obtained using get_user_pages. */ static void qp_release_pages(struct page **pages, u64 num_pages, bool dirty) { int i; for (i = 0; i < num_pages; i++) { if (dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); pages[i] = NULL; } } /* * Lock the user pages referenced by the {produce,consume}Buffer * struct into memory and populate the {produce,consume}Pages * arrays in the attach structure with them. */ static int qp_host_get_user_memory(u64 produce_uva, u64 consume_uva, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int retval; int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, produce_q->kernel_if->num_pages, FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", retval); if (retval > 0) qp_release_pages(produce_q->kernel_if->u.h.header_page, retval, false); err = VMCI_ERROR_NO_MEM; goto out; } retval = get_user_pages_fast((uintptr_t) consume_uva, consume_q->kernel_if->num_pages, FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", retval); if (retval > 0) qp_release_pages(consume_q->kernel_if->u.h.header_page, retval, false); qp_release_pages(produce_q->kernel_if->u.h.header_page, produce_q->kernel_if->num_pages, false); err = VMCI_ERROR_NO_MEM; } out: return err; } /* * Registers the specification of the user pages used for backing a queue * pair. Enough information to map in pages is stored in the OS specific * part of the struct vmci_queue structure. */ static int qp_host_register_user_memory(struct vmci_qp_page_store *page_store, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { u64 produce_uva; u64 consume_uva; /* * The new style and the old style mapping only differs in * that we either get a single or two UVAs, so we split the * single UVA range at the appropriate spot. */ produce_uva = page_store->pages; consume_uva = page_store->pages + produce_q->kernel_if->num_pages * PAGE_SIZE; return qp_host_get_user_memory(produce_uva, consume_uva, produce_q, consume_q); } /* * Releases and removes the references to user pages stored in the attach * struct. Pages are released from the page cache and may become * swappable again. */ static void qp_host_unregister_user_memory(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { qp_release_pages(produce_q->kernel_if->u.h.header_page, produce_q->kernel_if->num_pages, true); memset(produce_q->kernel_if->u.h.header_page, 0, sizeof(*produce_q->kernel_if->u.h.header_page) * produce_q->kernel_if->num_pages); qp_release_pages(consume_q->kernel_if->u.h.header_page, consume_q->kernel_if->num_pages, true); memset(consume_q->kernel_if->u.h.header_page, 0, sizeof(*consume_q->kernel_if->u.h.header_page) * consume_q->kernel_if->num_pages); } /* * Once qp_host_register_user_memory has been performed on a * queue, the queue pair headers can be mapped into the * kernel. Once mapped, they must be unmapped with * qp_host_unmap_queues prior to calling * qp_host_unregister_user_memory. * Pages are pinned. */ static int qp_host_map_queues(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int result; if (!produce_q->q_header || !consume_q->q_header) { struct page *headers[2]; if (produce_q->q_header != consume_q->q_header) return VMCI_ERROR_QUEUEPAIR_MISMATCH; if (produce_q->kernel_if->u.h.header_page == NULL || *produce_q->kernel_if->u.h.header_page == NULL) return VMCI_ERROR_UNAVAILABLE; headers[0] = *produce_q->kernel_if->u.h.header_page; headers[1] = *consume_q->kernel_if->u.h.header_page; produce_q->q_header = vmap(headers, 2, VM_MAP, PAGE_KERNEL); if (produce_q->q_header != NULL) { consume_q->q_header = (struct vmci_queue_header *)((u8 *) produce_q->q_header + PAGE_SIZE); result = VMCI_SUCCESS; } else { pr_warn("vmap failed\n"); result = VMCI_ERROR_NO_MEM; } } else { result = VMCI_SUCCESS; } return result; } /* * Unmaps previously mapped queue pair headers from the kernel. * Pages are unpinned. */ static int qp_host_unmap_queues(u32 gid, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { if (produce_q->q_header) { if (produce_q->q_header < consume_q->q_header) vunmap(produce_q->q_header); else vunmap(consume_q->q_header); produce_q->q_header = NULL; consume_q->q_header = NULL; } return VMCI_SUCCESS; } /* * Finds the entry in the list corresponding to a given handle. Assumes * that the list is locked. */ static struct qp_entry *qp_list_find(struct qp_list *qp_list, struct vmci_handle handle) { struct qp_entry *entry; if (vmci_handle_is_invalid(handle)) return NULL; list_for_each_entry(entry, &qp_list->head, list_item) { if (vmci_handle_is_equal(entry->handle, handle)) return entry; } return NULL; } /* * Finds the entry in the list corresponding to a given handle. */ static struct qp_guest_endpoint * qp_guest_handle_to_entry(struct vmci_handle handle) { struct qp_guest_endpoint *entry; struct qp_entry *qp = qp_list_find(&qp_guest_endpoints, handle); entry = qp ? container_of( qp, struct qp_guest_endpoint, qp) : NULL; return entry; } /* * Finds the entry in the list corresponding to a given handle. */ static struct qp_broker_entry * qp_broker_handle_to_entry(struct vmci_handle handle) { struct qp_broker_entry *entry; struct qp_entry *qp = qp_list_find(&qp_broker_list, handle); entry = qp ? container_of( qp, struct qp_broker_entry, qp) : NULL; return entry; } /* * Dispatches a queue pair event message directly into the local event * queue. */ static int qp_notify_peer_local(bool attach, struct vmci_handle handle) { u32 context_id = vmci_get_context_id(); struct vmci_event_qp ev; memset(&ev, 0, sizeof(ev)); ev.msg.hdr.dst = vmci_make_handle(context_id, VMCI_EVENT_HANDLER); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); ev.msg.event_data.event = attach ? VMCI_EVENT_QP_PEER_ATTACH : VMCI_EVENT_QP_PEER_DETACH; ev.payload.peer_id = context_id; ev.payload.handle = handle; return vmci_event_dispatch(&ev.msg.hdr); } /* * Allocates and initializes a qp_guest_endpoint structure. * Allocates a queue_pair rid (and handle) iff the given entry has * an invalid handle. 0 through VMCI_RESERVED_RESOURCE_ID_MAX * are reserved handles. Assumes that the QP list mutex is held * by the caller. */ static struct qp_guest_endpoint * qp_guest_endpoint_create(struct vmci_handle handle, u32 peer, u32 flags, u64 produce_size, u64 consume_size, void *produce_q, void *consume_q) { int result; struct qp_guest_endpoint *entry; /* One page each for the queue headers. */ const u64 num_ppns = DIV_ROUND_UP(produce_size, PAGE_SIZE) + DIV_ROUND_UP(consume_size, PAGE_SIZE) + 2; if (vmci_handle_is_invalid(handle)) { u32 context_id = vmci_get_context_id(); handle = vmci_make_handle(context_id, VMCI_INVALID_ID); } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry) { entry->qp.peer = peer; entry->qp.flags = flags; entry->qp.produce_size = produce_size; entry->qp.consume_size = consume_size; entry->qp.ref_count = 0; entry->num_ppns = num_ppns; entry->produce_q = produce_q; entry->consume_q = consume_q; INIT_LIST_HEAD(&entry->qp.list_item); /* Add resource obj */ result = vmci_resource_add(&entry->resource, VMCI_RESOURCE_TYPE_QPAIR_GUEST, handle); entry->qp.handle = vmci_resource_handle(&entry->resource); if ((result != VMCI_SUCCESS) || qp_list_find(&qp_guest_endpoints, entry->qp.handle)) { pr_warn("Failed to add new resource (handle=0x%x:0x%x), error: %d", handle.context, handle.resource, result); kfree(entry); entry = NULL; } } return entry; } /* * Frees a qp_guest_endpoint structure. */ static void qp_guest_endpoint_destroy(struct qp_guest_endpoint *entry) { qp_free_ppn_set(&entry->ppn_set); qp_cleanup_queue_mutex(entry->produce_q, entry->consume_q); qp_free_queue(entry->produce_q, entry->qp.produce_size); qp_free_queue(entry->consume_q, entry->qp.consume_size); /* Unlink from resource hash table and free callback */ vmci_resource_remove(&entry->resource); kfree(entry); } /* * Helper to make a queue_pairAlloc hypercall when the driver is * supporting a guest device. */ static int qp_alloc_hypercall(const struct qp_guest_endpoint *entry) { struct vmci_qp_alloc_msg *alloc_msg; size_t msg_size; size_t ppn_size; int result; if (!entry || entry->num_ppns <= 2) return VMCI_ERROR_INVALID_ARGS; ppn_size = vmci_use_ppn64() ? sizeof(u64) : sizeof(u32); msg_size = sizeof(*alloc_msg) + (size_t) entry->num_ppns * ppn_size; alloc_msg = kmalloc(msg_size, GFP_KERNEL); if (!alloc_msg) return VMCI_ERROR_NO_MEM; alloc_msg->hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_QUEUEPAIR_ALLOC); alloc_msg->hdr.src = VMCI_ANON_SRC_HANDLE; alloc_msg->hdr.payload_size = msg_size - VMCI_DG_HEADERSIZE; alloc_msg->handle = entry->qp.handle; alloc_msg->peer = entry->qp.peer; alloc_msg->flags = entry->qp.flags; alloc_msg->produce_size = entry->qp.produce_size; alloc_msg->consume_size = entry->qp.consume_size; alloc_msg->num_ppns = entry->num_ppns; result = qp_populate_ppn_set((u8 *)alloc_msg + sizeof(*alloc_msg), &entry->ppn_set); if (result == VMCI_SUCCESS) result = vmci_send_datagram(&alloc_msg->hdr); kfree(alloc_msg); return result; } /* * Helper to make a queue_pairDetach hypercall when the driver is * supporting a guest device. */ static int qp_detatch_hypercall(struct vmci_handle handle) { struct vmci_qp_detach_msg detach_msg; detach_msg.hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_QUEUEPAIR_DETACH); detach_msg.hdr.src = VMCI_ANON_SRC_HANDLE; detach_msg.hdr.payload_size = sizeof(handle); detach_msg.handle = handle; return vmci_send_datagram(&detach_msg.hdr); } /* * Adds the given entry to the list. Assumes that the list is locked. */ static void qp_list_add_entry(struct qp_list *qp_list, struct qp_entry *entry) { if (entry) list_add(&entry->list_item, &qp_list->head); } /* * Removes the given entry from the list. Assumes that the list is locked. */ static void qp_list_remove_entry(struct qp_list *qp_list, struct qp_entry *entry) { if (entry) list_del(&entry->list_item); } /* * Helper for VMCI queue_pair detach interface. Frees the physical * pages for the queue pair. */ static int qp_detatch_guest_work(struct vmci_handle handle) { int result; struct qp_guest_endpoint *entry; u32 ref_count = ~0; /* To avoid compiler warning below */ mutex_lock(&qp_guest_endpoints.mutex); entry = qp_guest_handle_to_entry(handle); if (!entry) { mutex_unlock(&qp_guest_endpoints.mutex); return VMCI_ERROR_NOT_FOUND; } if (entry->qp.flags & VMCI_QPFLAG_LOCAL) { result = VMCI_SUCCESS; if (entry->qp.ref_count > 1) { result = qp_notify_peer_local(false, handle); /* * We can fail to notify a local queuepair * because we can't allocate. We still want * to release the entry if that happens, so * don't bail out yet. */ } } else { result = qp_detatch_hypercall(handle); if (result < VMCI_SUCCESS) { /* * We failed to notify a non-local queuepair. * That other queuepair might still be * accessing the shared memory, so don't * release the entry yet. It will get cleaned * up by VMCIqueue_pair_Exit() if necessary * (assuming we are going away, otherwise why * did this fail?). */ mutex_unlock(&qp_guest_endpoints.mutex); return result; } } /* * If we get here then we either failed to notify a local queuepair, or * we succeeded in all cases. Release the entry if required. */ entry->qp.ref_count--; if (entry->qp.ref_count == 0) qp_list_remove_entry(&qp_guest_endpoints, &entry->qp); /* If we didn't remove the entry, this could change once we unlock. */ if (entry) ref_count = entry->qp.ref_count; mutex_unlock(&qp_guest_endpoints.mutex); if (ref_count == 0) qp_guest_endpoint_destroy(entry); return result; } /* * This functions handles the actual allocation of a VMCI queue * pair guest endpoint. Allocates physical pages for the queue * pair. It makes OS dependent calls through generic wrappers. */ static int qp_alloc_guest_work(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags) { const u64 num_produce_pages = DIV_ROUND_UP(produce_size, PAGE_SIZE) + 1; const u64 num_consume_pages = DIV_ROUND_UP(consume_size, PAGE_SIZE) + 1; void *my_produce_q = NULL; void *my_consume_q = NULL; int result; struct qp_guest_endpoint *queue_pair_entry = NULL; if (priv_flags != VMCI_NO_PRIVILEGE_FLAGS) return VMCI_ERROR_NO_ACCESS; mutex_lock(&qp_guest_endpoints.mutex); queue_pair_entry = qp_guest_handle_to_entry(*handle); if (queue_pair_entry) { if (queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) { /* Local attach case. */ if (queue_pair_entry->qp.ref_count > 1) { pr_devel("Error attempting to attach more than once\n"); result = VMCI_ERROR_UNAVAILABLE; goto error_keep_entry; } if (queue_pair_entry->qp.produce_size != consume_size || queue_pair_entry->qp.consume_size != produce_size || queue_pair_entry->qp.flags != (flags & ~VMCI_QPFLAG_ATTACH_ONLY)) { pr_devel("Error mismatched queue pair in local attach\n"); result = VMCI_ERROR_QUEUEPAIR_MISMATCH; goto error_keep_entry; } /* * Do a local attach. We swap the consume and * produce queues for the attacher and deliver * an attach event. */ result = qp_notify_peer_local(true, *handle); if (result < VMCI_SUCCESS) goto error_keep_entry; my_produce_q = queue_pair_entry->consume_q; my_consume_q = queue_pair_entry->produce_q; goto out; } result = VMCI_ERROR_ALREADY_EXISTS; goto error_keep_entry; } my_produce_q = qp_alloc_queue(produce_size, flags); if (!my_produce_q) { pr_warn("Error allocating pages for produce queue\n"); result = VMCI_ERROR_NO_MEM; goto error; } my_consume_q = qp_alloc_queue(consume_size, flags); if (!my_consume_q) { pr_warn("Error allocating pages for consume queue\n"); result = VMCI_ERROR_NO_MEM; goto error; } queue_pair_entry = qp_guest_endpoint_create(*handle, peer, flags, produce_size, consume_size, my_produce_q, my_consume_q); if (!queue_pair_entry) { pr_warn("Error allocating memory in %s\n", __func__); result = VMCI_ERROR_NO_MEM; goto error; } result = qp_alloc_ppn_set(my_produce_q, num_produce_pages, my_consume_q, num_consume_pages, &queue_pair_entry->ppn_set); if (result < VMCI_SUCCESS) { pr_warn("qp_alloc_ppn_set failed\n"); goto error; } /* * It's only necessary to notify the host if this queue pair will be * attached to from another context. */ if (queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) { /* Local create case. */ u32 context_id = vmci_get_context_id(); /* * Enforce similar checks on local queue pairs as we * do for regular ones. The handle's context must * match the creator or attacher context id (here they * are both the current context id) and the * attach-only flag cannot exist during create. We * also ensure specified peer is this context or an * invalid one. */ if (queue_pair_entry->qp.handle.context != context_id || (queue_pair_entry->qp.peer != VMCI_INVALID_ID && queue_pair_entry->qp.peer != context_id)) { result = VMCI_ERROR_NO_ACCESS; goto error; } if (queue_pair_entry->qp.flags & VMCI_QPFLAG_ATTACH_ONLY) { result = VMCI_ERROR_NOT_FOUND; goto error; } } else { result = qp_alloc_hypercall(queue_pair_entry); if (result < VMCI_SUCCESS) { pr_devel("qp_alloc_hypercall result = %d\n", result); goto error; } } qp_init_queue_mutex((struct vmci_queue *)my_produce_q, (struct vmci_queue *)my_consume_q); qp_list_add_entry(&qp_guest_endpoints, &queue_pair_entry->qp); out: queue_pair_entry->qp.ref_count++; *handle = queue_pair_entry->qp.handle; *produce_q = (struct vmci_queue *)my_produce_q; *consume_q = (struct vmci_queue *)my_consume_q; /* * We should initialize the queue pair header pages on a local * queue pair create. For non-local queue pairs, the * hypervisor initializes the header pages in the create step. */ if ((queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) && queue_pair_entry->qp.ref_count == 1) { vmci_q_header_init((*produce_q)->q_header, *handle); vmci_q_header_init((*consume_q)->q_header, *handle); } mutex_unlock(&qp_guest_endpoints.mutex); return VMCI_SUCCESS; error: mutex_unlock(&qp_guest_endpoints.mutex); if (queue_pair_entry) { /* The queues will be freed inside the destroy routine. */ qp_guest_endpoint_destroy(queue_pair_entry); } else { qp_free_queue(my_produce_q, produce_size); qp_free_queue(my_consume_q, consume_size); } return result; error_keep_entry: /* This path should only be used when an existing entry was found. */ mutex_unlock(&qp_guest_endpoints.mutex); return result; } /* * The first endpoint issuing a queue pair allocation will create the state * of the queue pair in the queue pair broker. * * If the creator is a guest, it will associate a VMX virtual address range * with the queue pair as specified by the page_store. For compatibility with * older VMX'en, that would use a separate step to set the VMX virtual * address range, the virtual address range can be registered later using * vmci_qp_broker_set_page_store. In that case, a page_store of NULL should be * used. * * If the creator is the host, a page_store of NULL should be used as well, * since the host is not able to supply a page store for the queue pair. * * For older VMX and host callers, the queue pair will be created in the * VMCIQPB_CREATED_NO_MEM state, and for current VMX callers, it will be * created in VMCOQPB_CREATED_MEM state. */ static int qp_broker_create(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent) { struct qp_broker_entry *entry = NULL; const u32 context_id = vmci_ctx_get_id(context); bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; u64 guest_produce_size; u64 guest_consume_size; /* Do not create if the caller asked not to. */ if (flags & VMCI_QPFLAG_ATTACH_ONLY) return VMCI_ERROR_NOT_FOUND; /* * Creator's context ID should match handle's context ID or the creator * must allow the context in handle's context ID as the "peer". */ if (handle.context != context_id && handle.context != peer) return VMCI_ERROR_NO_ACCESS; if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(peer)) return VMCI_ERROR_DST_UNREACHABLE; /* * Creator's context ID for local queue pairs should match the * peer, if a peer is specified. */ if (is_local && peer != VMCI_INVALID_ID && context_id != peer) return VMCI_ERROR_NO_ACCESS; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return VMCI_ERROR_NO_MEM; if (vmci_ctx_get_id(context) == VMCI_HOST_CONTEXT_ID && !is_local) { /* * The queue pair broker entry stores values from the guest * point of view, so a creating host side endpoint should swap * produce and consume values -- unless it is a local queue * pair, in which case no swapping is necessary, since the local * attacher will swap queues. */ guest_produce_size = consume_size; guest_consume_size = produce_size; } else { guest_produce_size = produce_size; guest_consume_size = consume_size; } entry->qp.handle = handle; entry->qp.peer = peer; entry->qp.flags = flags; entry->qp.produce_size = guest_produce_size; entry->qp.consume_size = guest_consume_size; entry->qp.ref_count = 1; entry->create_id = context_id; entry->attach_id = VMCI_INVALID_ID; entry->state = VMCIQPB_NEW; entry->require_trusted_attach = !!(context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED); entry->created_by_trusted = !!(priv_flags & VMCI_PRIVILEGE_FLAG_TRUSTED); entry->vmci_page_files = false; entry->wakeup_cb = wakeup_cb; entry->client_data = client_data; entry->produce_q = qp_host_alloc_queue(guest_produce_size); if (entry->produce_q == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } entry->consume_q = qp_host_alloc_queue(guest_consume_size); if (entry->consume_q == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } qp_init_queue_mutex(entry->produce_q, entry->consume_q); INIT_LIST_HEAD(&entry->qp.list_item); if (is_local) { u8 *tmp; entry->local_mem = kcalloc(QPE_NUM_PAGES(entry->qp), PAGE_SIZE, GFP_KERNEL); if (entry->local_mem == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } entry->state = VMCIQPB_CREATED_MEM; entry->produce_q->q_header = entry->local_mem; tmp = (u8 *)entry->local_mem + PAGE_SIZE * (DIV_ROUND_UP(entry->qp.produce_size, PAGE_SIZE) + 1); entry->consume_q->q_header = (struct vmci_queue_header *)tmp; } else if (page_store) { /* * The VMX already initialized the queue pair headers, so no * need for the kernel side to do that. */ result = qp_host_register_user_memory(page_store, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) goto error; entry->state = VMCIQPB_CREATED_MEM; } else { /* * A create without a page_store may be either a host * side create (in which case we are waiting for the * guest side to supply the memory) or an old style * queue pair create (in which case we will expect a * set page store call as the next step). */ entry->state = VMCIQPB_CREATED_NO_MEM; } qp_list_add_entry(&qp_broker_list, &entry->qp); if (ent != NULL) *ent = entry; /* Add to resource obj */ result = vmci_resource_add(&entry->resource, VMCI_RESOURCE_TYPE_QPAIR_HOST, handle); if (result != VMCI_SUCCESS) { pr_warn("Failed to add new resource (handle=0x%x:0x%x), error: %d", handle.context, handle.resource, result); goto error; } entry->qp.handle = vmci_resource_handle(&entry->resource); if (is_local) { vmci_q_header_init(entry->produce_q->q_header, entry->qp.handle); vmci_q_header_init(entry->consume_q->q_header, entry->qp.handle); } vmci_ctx_qp_create(context, entry->qp.handle); return VMCI_SUCCESS; error: if (entry != NULL) { qp_host_free_queue(entry->produce_q, guest_produce_size); qp_host_free_queue(entry->consume_q, guest_consume_size); kfree(entry); } return result; } /* * Enqueues an event datagram to notify the peer VM attached to * the given queue pair handle about attach/detach event by the * given VM. Returns Payload size of datagram enqueued on * success, error code otherwise. */ static int qp_notify_peer(bool attach, struct vmci_handle handle, u32 my_id, u32 peer_id) { int rv; struct vmci_event_qp ev; if (vmci_handle_is_invalid(handle) || my_id == VMCI_INVALID_ID || peer_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; /* * In vmci_ctx_enqueue_datagram() we enforce the upper limit on * number of pending events from the hypervisor to a given VM * otherwise a rogue VM could do an arbitrary number of attach * and detach operations causing memory pressure in the host * kernel. */ memset(&ev, 0, sizeof(ev)); ev.msg.hdr.dst = vmci_make_handle(peer_id, VMCI_EVENT_HANDLER); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); ev.msg.event_data.event = attach ? VMCI_EVENT_QP_PEER_ATTACH : VMCI_EVENT_QP_PEER_DETACH; ev.payload.handle = handle; ev.payload.peer_id = my_id; rv = vmci_datagram_dispatch(VMCI_HYPERVISOR_CONTEXT_ID, &ev.msg.hdr, false); if (rv < VMCI_SUCCESS) pr_warn("Failed to enqueue queue_pair %s event datagram for context (ID=0x%x)\n", attach ? "ATTACH" : "DETACH", peer_id); return rv; } /* * The second endpoint issuing a queue pair allocation will attach to * the queue pair registered with the queue pair broker. * * If the attacher is a guest, it will associate a VMX virtual address * range with the queue pair as specified by the page_store. At this * point, the already attach host endpoint may start using the queue * pair, and an attach event is sent to it. For compatibility with * older VMX'en, that used a separate step to set the VMX virtual * address range, the virtual address range can be registered later * using vmci_qp_broker_set_page_store. In that case, a page_store of * NULL should be used, and the attach event will be generated once * the actual page store has been set. * * If the attacher is the host, a page_store of NULL should be used as * well, since the page store information is already set by the guest. * * For new VMX and host callers, the queue pair will be moved to the * VMCIQPB_ATTACHED_MEM state, and for older VMX callers, it will be * moved to the VMCOQPB_ATTACHED_NO_MEM state. */ static int qp_broker_attach(struct qp_broker_entry *entry, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent) { const u32 context_id = vmci_ctx_get_id(context); bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; if (entry->state != VMCIQPB_CREATED_NO_MEM && entry->state != VMCIQPB_CREATED_MEM) return VMCI_ERROR_UNAVAILABLE; if (is_local) { if (!(entry->qp.flags & VMCI_QPFLAG_LOCAL) || context_id != entry->create_id) { return VMCI_ERROR_INVALID_ARGS; } } else if (context_id == entry->create_id || context_id == entry->attach_id) { return VMCI_ERROR_ALREADY_EXISTS; } if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(entry->create_id)) return VMCI_ERROR_DST_UNREACHABLE; /* * If we are attaching from a restricted context then the queuepair * must have been created by a trusted endpoint. */ if ((context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED) && !entry->created_by_trusted) return VMCI_ERROR_NO_ACCESS; /* * If we are attaching to a queuepair that was created by a restricted * context then we must be trusted. */ if (entry->require_trusted_attach && (!(priv_flags & VMCI_PRIVILEGE_FLAG_TRUSTED))) return VMCI_ERROR_NO_ACCESS; /* * If the creator specifies VMCI_INVALID_ID in "peer" field, access * control check is not performed. */ if (entry->qp.peer != VMCI_INVALID_ID && entry->qp.peer != context_id) return VMCI_ERROR_NO_ACCESS; if (entry->create_id == VMCI_HOST_CONTEXT_ID) { /* * Do not attach if the caller doesn't support Host Queue Pairs * and a host created this queue pair. */ if (!vmci_ctx_supports_host_qp(context)) return VMCI_ERROR_INVALID_RESOURCE; } else if (context_id == VMCI_HOST_CONTEXT_ID) { struct vmci_ctx *create_context; bool supports_host_qp; /* * Do not attach a host to a user created queue pair if that * user doesn't support host queue pair end points. */ create_context = vmci_ctx_get(entry->create_id); supports_host_qp = vmci_ctx_supports_host_qp(create_context); vmci_ctx_put(create_context); if (!supports_host_qp) return VMCI_ERROR_INVALID_RESOURCE; } if ((entry->qp.flags & ~VMCI_QP_ASYMM) != (flags & ~VMCI_QP_ASYMM_PEER)) return VMCI_ERROR_QUEUEPAIR_MISMATCH; if (context_id != VMCI_HOST_CONTEXT_ID) { /* * The queue pair broker entry stores values from the guest * point of view, so an attaching guest should match the values * stored in the entry. */ if (entry->qp.produce_size != produce_size || entry->qp.consume_size != consume_size) { return VMCI_ERROR_QUEUEPAIR_MISMATCH; } } else if (entry->qp.produce_size != consume_size || entry->qp.consume_size != produce_size) { return VMCI_ERROR_QUEUEPAIR_MISMATCH; } if (context_id != VMCI_HOST_CONTEXT_ID) { /* * If a guest attached to a queue pair, it will supply * the backing memory. If this is a pre NOVMVM vmx, * the backing memory will be supplied by calling * vmci_qp_broker_set_page_store() following the * return of the vmci_qp_broker_alloc() call. If it is * a vmx of version NOVMVM or later, the page store * must be supplied as part of the * vmci_qp_broker_alloc call. Under all circumstances * must the initially created queue pair not have any * memory associated with it already. */ if (entry->state != VMCIQPB_CREATED_NO_MEM) return VMCI_ERROR_INVALID_ARGS; if (page_store != NULL) { /* * Patch up host state to point to guest * supplied memory. The VMX already * initialized the queue pair headers, so no * need for the kernel side to do that. */ result = qp_host_register_user_memory(page_store, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) return result; entry->state = VMCIQPB_ATTACHED_MEM; } else { entry->state = VMCIQPB_ATTACHED_NO_MEM; } } else if (entry->state == VMCIQPB_CREATED_NO_MEM) { /* * The host side is attempting to attach to a queue * pair that doesn't have any memory associated with * it. This must be a pre NOVMVM vmx that hasn't set * the page store information yet, or a quiesced VM. */ return VMCI_ERROR_UNAVAILABLE; } else { /* The host side has successfully attached to a queue pair. */ entry->state = VMCIQPB_ATTACHED_MEM; } if (entry->state == VMCIQPB_ATTACHED_MEM) { result = qp_notify_peer(true, entry->qp.handle, context_id, entry->create_id); if (result < VMCI_SUCCESS) pr_warn("Failed to notify peer (ID=0x%x) of attach to queue pair (handle=0x%x:0x%x)\n", entry->create_id, entry->qp.handle.context, entry->qp.handle.resource); } entry->attach_id = context_id; entry->qp.ref_count++; if (wakeup_cb) { entry->wakeup_cb = wakeup_cb; entry->client_data = client_data; } /* * When attaching to local queue pairs, the context already has * an entry tracking the queue pair, so don't add another one. */ if (!is_local) vmci_ctx_qp_create(context, entry->qp.handle); if (ent != NULL) *ent = entry; return VMCI_SUCCESS; } /* * queue_pair_Alloc for use when setting up queue pair endpoints * on the host. */ static int qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent, bool *swap) { const u32 context_id = vmci_ctx_get_id(context); bool create; struct qp_broker_entry *entry = NULL; bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; if (vmci_handle_is_invalid(handle) || (flags & ~VMCI_QP_ALL_FLAGS) || is_local || !(produce_size || consume_size) || !context || context_id == VMCI_INVALID_ID || handle.context == VMCI_INVALID_ID) { return VMCI_ERROR_INVALID_ARGS; } if (page_store && !VMCI_QP_PAGESTORE_IS_WELLFORMED(page_store)) return VMCI_ERROR_INVALID_ARGS; /* * In the initial argument check, we ensure that non-vmkernel hosts * are not allowed to create local queue pairs. */ mutex_lock(&qp_broker_list.mutex); if (!is_local && vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) already attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); mutex_unlock(&qp_broker_list.mutex); return VMCI_ERROR_ALREADY_EXISTS; } if (handle.resource != VMCI_INVALID_ID) entry = qp_broker_handle_to_entry(handle); if (!entry) { create = true; result = qp_broker_create(handle, peer, flags, priv_flags, produce_size, consume_size, page_store, context, wakeup_cb, client_data, ent); } else { create = false; result = qp_broker_attach(entry, peer, flags, priv_flags, produce_size, consume_size, page_store, context, wakeup_cb, client_data, ent); } mutex_unlock(&qp_broker_list.mutex); if (swap) *swap = (context_id == VMCI_HOST_CONTEXT_ID) && !(create && is_local); return result; } /* * This function implements the kernel API for allocating a queue * pair. */ static int qp_alloc_host_work(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, vmci_event_release_cb wakeup_cb, void *client_data) { struct vmci_handle new_handle; struct vmci_ctx *context; struct qp_broker_entry *entry; int result; bool swap; if (vmci_handle_is_invalid(*handle)) { new_handle = vmci_make_handle( VMCI_HOST_CONTEXT_ID, VMCI_INVALID_ID); } else new_handle = *handle; context = vmci_ctx_get(VMCI_HOST_CONTEXT_ID); entry = NULL; result = qp_broker_alloc(new_handle, peer, flags, priv_flags, produce_size, consume_size, NULL, context, wakeup_cb, client_data, &entry, &swap); if (result == VMCI_SUCCESS) { if (swap) { /* * If this is a local queue pair, the attacher * will swap around produce and consume * queues. */ *produce_q = entry->consume_q; *consume_q = entry->produce_q; } else { *produce_q = entry->produce_q; *consume_q = entry->consume_q; } *handle = vmci_resource_handle(&entry->resource); } else { *handle = VMCI_INVALID_HANDLE; pr_devel("queue pair broker failed to alloc (result=%d)\n", result); } vmci_ctx_put(context); return result; } /* * Allocates a VMCI queue_pair. Only checks validity of input * arguments. The real work is done in the host or guest * specific function. */ int vmci_qp_alloc(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, bool guest_endpoint, vmci_event_release_cb wakeup_cb, void *client_data) { if (!handle || !produce_q || !consume_q || (!produce_size && !consume_size) || (flags & ~VMCI_QP_ALL_FLAGS)) return VMCI_ERROR_INVALID_ARGS; if (guest_endpoint) { return qp_alloc_guest_work(handle, produce_q, produce_size, consume_q, consume_size, peer, flags, priv_flags); } else { return qp_alloc_host_work(handle, produce_q, produce_size, consume_q, consume_size, peer, flags, priv_flags, wakeup_cb, client_data); } } /* * This function implements the host kernel API for detaching from * a queue pair. */ static int qp_detatch_host_work(struct vmci_handle handle) { int result; struct vmci_ctx *context; context = vmci_ctx_get(VMCI_HOST_CONTEXT_ID); result = vmci_qp_broker_detach(handle, context); vmci_ctx_put(context); return result; } /* * Detaches from a VMCI queue_pair. Only checks validity of input argument. * Real work is done in the host or guest specific function. */ static int qp_detatch(struct vmci_handle handle, bool guest_endpoint) { if (vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; if (guest_endpoint) return qp_detatch_guest_work(handle); else return qp_detatch_host_work(handle); } /* * Returns the entry from the head of the list. Assumes that the list is * locked. */ static struct qp_entry *qp_list_get_head(struct qp_list *qp_list) { if (!list_empty(&qp_list->head)) { struct qp_entry *entry = list_first_entry(&qp_list->head, struct qp_entry, list_item); return entry; } return NULL; } void vmci_qp_broker_exit(void) { struct qp_entry *entry; struct qp_broker_entry *be; mutex_lock(&qp_broker_list.mutex); while ((entry = qp_list_get_head(&qp_broker_list))) { be = (struct qp_broker_entry *)entry; qp_list_remove_entry(&qp_broker_list, entry); kfree(be); } mutex_unlock(&qp_broker_list.mutex); } /* * Requests that a queue pair be allocated with the VMCI queue * pair broker. Allocates a queue pair entry if one does not * exist. Attaches to one if it exists, and retrieves the page * files backing that queue_pair. Assumes that the queue pair * broker lock is held. */ int vmci_qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context) { if (!QP_SIZES_ARE_VALID(produce_size, consume_size)) return VMCI_ERROR_NO_RESOURCES; return qp_broker_alloc(handle, peer, flags, priv_flags, produce_size, consume_size, page_store, context, NULL, NULL, NULL, NULL); } /* * VMX'en with versions lower than VMCI_VERSION_NOVMVM use a separate * step to add the UVAs of the VMX mapping of the queue pair. This function * provides backwards compatibility with such VMX'en, and takes care of * registering the page store for a queue pair previously allocated by the * VMX during create or attach. This function will move the queue pair state * to either from VMCIQBP_CREATED_NO_MEM to VMCIQBP_CREATED_MEM or * VMCIQBP_ATTACHED_NO_MEM to VMCIQBP_ATTACHED_MEM. If moving to the * attached state with memory, the queue pair is ready to be used by the * host peer, and an attached event will be generated. * * Assumes that the queue pair broker lock is held. * * This function is only used by the hosted platform, since there is no * issue with backwards compatibility for vmkernel. */ int vmci_qp_broker_set_page_store(struct vmci_handle handle, u64 produce_uva, u64 consume_uva, struct vmci_ctx *context) { struct qp_broker_entry *entry; int result; const u32 context_id = vmci_ctx_get_id(context); if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; /* * We only support guest to host queue pairs, so the VMX must * supply UVAs for the mapped page files. */ if (produce_uva == 0 || consume_uva == 0) return VMCI_ERROR_INVALID_ARGS; mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_warn("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { result = VMCI_ERROR_NOT_FOUND; goto out; } /* * If I'm the owner then I can set the page store. * * Or, if a host created the queue_pair and I'm the attached peer * then I can set the page store. */ if (entry->create_id != context_id && (entry->create_id != VMCI_HOST_CONTEXT_ID || entry->attach_id != context_id)) { result = VMCI_ERROR_QUEUEPAIR_NOTOWNER; goto out; } if (entry->state != VMCIQPB_CREATED_NO_MEM && entry->state != VMCIQPB_ATTACHED_NO_MEM) { result = VMCI_ERROR_UNAVAILABLE; goto out; } result = qp_host_get_user_memory(produce_uva, consume_uva, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) goto out; result = qp_host_map_queues(entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) { qp_host_unregister_user_memory(entry->produce_q, entry->consume_q); goto out; } if (entry->state == VMCIQPB_CREATED_NO_MEM) entry->state = VMCIQPB_CREATED_MEM; else entry->state = VMCIQPB_ATTACHED_MEM; entry->vmci_page_files = true; if (entry->state == VMCIQPB_ATTACHED_MEM) { result = qp_notify_peer(true, handle, context_id, entry->create_id); if (result < VMCI_SUCCESS) { pr_warn("Failed to notify peer (ID=0x%x) of attach to queue pair (handle=0x%x:0x%x)\n", entry->create_id, entry->qp.handle.context, entry->qp.handle.resource); } } result = VMCI_SUCCESS; out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Resets saved queue headers for the given QP broker * entry. Should be used when guest memory becomes available * again, or the guest detaches. */ static void qp_reset_saved_headers(struct qp_broker_entry *entry) { entry->produce_q->saved_header = NULL; entry->consume_q->saved_header = NULL; } /* * The main entry point for detaching from a queue pair registered with the * queue pair broker. If more than one endpoint is attached to the queue * pair, the first endpoint will mainly decrement a reference count and * generate a notification to its peer. The last endpoint will clean up * the queue pair state registered with the broker. * * When a guest endpoint detaches, it will unmap and unregister the guest * memory backing the queue pair. If the host is still attached, it will * no longer be able to access the queue pair content. * * If the queue pair is already in a state where there is no memory * registered for the queue pair (any *_NO_MEM state), it will transition to * the VMCIQPB_SHUTDOWN_NO_MEM state. This will also happen, if a guest * endpoint is the first of two endpoints to detach. If the host endpoint is * the first out of two to detach, the queue pair will move to the * VMCIQPB_SHUTDOWN_MEM state. */ int vmci_qp_broker_detach(struct vmci_handle handle, struct vmci_ctx *context) { struct qp_broker_entry *entry; const u32 context_id = vmci_ctx_get_id(context); u32 peer_id; bool is_local = false; int result; if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) { return VMCI_ERROR_INVALID_ARGS; } mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { pr_devel("Context (ID=0x%x) reports being attached to queue pair(handle=0x%x:0x%x) that isn't present in broker\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } if (context_id != entry->create_id && context_id != entry->attach_id) { result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED; goto out; } if (context_id == entry->create_id) { peer_id = entry->attach_id; entry->create_id = VMCI_INVALID_ID; } else { peer_id = entry->create_id; entry->attach_id = VMCI_INVALID_ID; } entry->qp.ref_count--; is_local = entry->qp.flags & VMCI_QPFLAG_LOCAL; if (context_id != VMCI_HOST_CONTEXT_ID) { bool headers_mapped; /* * Pre NOVMVM vmx'en may detach from a queue pair * before setting the page store, and in that case * there is no user memory to detach from. Also, more * recent VMX'en may detach from a queue pair in the * quiesced state. */ qp_acquire_queue_mutex(entry->produce_q); headers_mapped = entry->produce_q->q_header || entry->consume_q->q_header; if (QPBROKERSTATE_HAS_MEM(entry)) { result = qp_host_unmap_queues(INVALID_VMCI_GUEST_MEM_ID, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) pr_warn("Failed to unmap queue headers for queue pair (handle=0x%x:0x%x,result=%d)\n", handle.context, handle.resource, result); qp_host_unregister_user_memory(entry->produce_q, entry->consume_q); } if (!headers_mapped) qp_reset_saved_headers(entry); qp_release_queue_mutex(entry->produce_q); if (!headers_mapped && entry->wakeup_cb) entry->wakeup_cb(entry->client_data); } else { if (entry->wakeup_cb) { entry->wakeup_cb = NULL; entry->client_data = NULL; } } if (entry->qp.ref_count == 0) { qp_list_remove_entry(&qp_broker_list, &entry->qp); if (is_local) kfree(entry->local_mem); qp_cleanup_queue_mutex(entry->produce_q, entry->consume_q); qp_host_free_queue(entry->produce_q, entry->qp.produce_size); qp_host_free_queue(entry->consume_q, entry->qp.consume_size); /* Unlink from resource hash table and free callback */ vmci_resource_remove(&entry->resource); kfree(entry); vmci_ctx_qp_destroy(context, handle); } else { qp_notify_peer(false, handle, context_id, peer_id); if (context_id == VMCI_HOST_CONTEXT_ID && QPBROKERSTATE_HAS_MEM(entry)) { entry->state = VMCIQPB_SHUTDOWN_MEM; } else { entry->state = VMCIQPB_SHUTDOWN_NO_MEM; } if (!is_local) vmci_ctx_qp_destroy(context, handle); } result = VMCI_SUCCESS; out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Establishes the necessary mappings for a queue pair given a * reference to the queue pair guest memory. This is usually * called when a guest is unquiesced and the VMX is allowed to * map guest memory once again. */ int vmci_qp_broker_map(struct vmci_handle handle, struct vmci_ctx *context, u64 guest_mem) { struct qp_broker_entry *entry; const u32 context_id = vmci_ctx_get_id(context); int result; if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { pr_devel("Context (ID=0x%x) reports being attached to queue pair (handle=0x%x:0x%x) that isn't present in broker\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } if (context_id != entry->create_id && context_id != entry->attach_id) { result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED; goto out; } result = VMCI_SUCCESS; if (context_id != VMCI_HOST_CONTEXT_ID && !QPBROKERSTATE_HAS_MEM(entry)) { struct vmci_qp_page_store page_store; page_store.pages = guest_mem; page_store.len = QPE_NUM_PAGES(entry->qp); qp_acquire_queue_mutex(entry->produce_q); qp_reset_saved_headers(entry); result = qp_host_register_user_memory(&page_store, entry->produce_q, entry->consume_q); qp_release_queue_mutex(entry->produce_q); if (result == VMCI_SUCCESS) { /* Move state from *_NO_MEM to *_MEM */ entry->state++; if (entry->wakeup_cb) entry->wakeup_cb(entry->client_data); } } out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Saves a snapshot of the queue headers for the given QP broker * entry. Should be used when guest memory is unmapped. * Results: * VMCI_SUCCESS on success, appropriate error code if guest memory * can't be accessed.. */ static int qp_save_headers(struct qp_broker_entry *entry) { int result; if (entry->produce_q->saved_header != NULL && entry->consume_q->saved_header != NULL) { /* * If the headers have already been saved, we don't need to do * it again, and we don't want to map in the headers * unnecessarily. */ return VMCI_SUCCESS; } if (NULL == entry->produce_q->q_header || NULL == entry->consume_q->q_header) { result = qp_host_map_queues(entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) return result; } memcpy(&entry->saved_produce_q, entry->produce_q->q_header, sizeof(entry->saved_produce_q)); entry->produce_q->saved_header = &entry->saved_produce_q; memcpy(&entry->saved_consume_q, entry->consume_q->q_header, sizeof(entry->saved_consume_q)); entry->consume_q->saved_header = &entry->saved_consume_q; return VMCI_SUCCESS; } /* * Removes all references to the guest memory of a given queue pair, and * will move the queue pair from state *_MEM to *_NO_MEM. It is usually * called when a VM is being quiesced where access to guest memory should * avoided. */ int vmci_qp_broker_unmap(struct vmci_handle handle, struct vmci_ctx *context, u32 gid) { struct qp_broker_entry *entry; const u32 context_id = vmci_ctx_get_id(context); int result; if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { pr_devel("Context (ID=0x%x) reports being attached to queue pair (handle=0x%x:0x%x) that isn't present in broker\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } if (context_id != entry->create_id && context_id != entry->attach_id) { result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED; goto out; } if (context_id != VMCI_HOST_CONTEXT_ID && QPBROKERSTATE_HAS_MEM(entry)) { qp_acquire_queue_mutex(entry->produce_q); result = qp_save_headers(entry); if (result < VMCI_SUCCESS) pr_warn("Failed to save queue headers for queue pair (handle=0x%x:0x%x,result=%d)\n", handle.context, handle.resource, result); qp_host_unmap_queues(gid, entry->produce_q, entry->consume_q); /* * On hosted, when we unmap queue pairs, the VMX will also * unmap the guest memory, so we invalidate the previously * registered memory. If the queue pair is mapped again at a * later point in time, we will need to reregister the user * memory with a possibly new user VA. */ qp_host_unregister_user_memory(entry->produce_q, entry->consume_q); /* * Move state from *_MEM to *_NO_MEM. */ entry->state--; qp_release_queue_mutex(entry->produce_q); } result = VMCI_SUCCESS; out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Destroys all guest queue pair endpoints. If active guest queue * pairs still exist, hypercalls to attempt detach from these * queue pairs will be made. Any failure to detach is silently * ignored. */ void vmci_qp_guest_endpoints_exit(void) { struct qp_entry *entry; struct qp_guest_endpoint *ep; mutex_lock(&qp_guest_endpoints.mutex); while ((entry = qp_list_get_head(&qp_guest_endpoints))) { ep = (struct qp_guest_endpoint *)entry; /* Don't make a hypercall for local queue_pairs. */ if (!(entry->flags & VMCI_QPFLAG_LOCAL)) qp_detatch_hypercall(entry->handle); /* We cannot fail the exit, so let's reset ref_count. */ entry->ref_count = 0; qp_list_remove_entry(&qp_guest_endpoints, entry); qp_guest_endpoint_destroy(ep); } mutex_unlock(&qp_guest_endpoints.mutex); } /* * Helper routine that will lock the queue pair before subsequent * operations. * Note: Non-blocking on the host side is currently only implemented in ESX. * Since non-blocking isn't yet implemented on the host personality we * have no reason to acquire a spin lock. So to avoid the use of an * unnecessary lock only acquire the mutex if we can block. */ static void qp_lock(const struct vmci_qp *qpair) { qp_acquire_queue_mutex(qpair->produce_q); } /* * Helper routine that unlocks the queue pair after calling * qp_lock. */ static void qp_unlock(const struct vmci_qp *qpair) { qp_release_queue_mutex(qpair->produce_q); } /* * The queue headers may not be mapped at all times. If a queue is * currently not mapped, it will be attempted to do so. */ static int qp_map_queue_headers(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int result; if (NULL == produce_q->q_header || NULL == consume_q->q_header) { result = qp_host_map_queues(produce_q, consume_q); if (result < VMCI_SUCCESS) return (produce_q->saved_header && consume_q->saved_header) ? VMCI_ERROR_QUEUEPAIR_NOT_READY : VMCI_ERROR_QUEUEPAIR_NOTATTACHED; } return VMCI_SUCCESS; } /* * Helper routine that will retrieve the produce and consume * headers of a given queue pair. If the guest memory of the * queue pair is currently not available, the saved queue headers * will be returned, if these are available. */ static int qp_get_queue_headers(const struct vmci_qp *qpair, struct vmci_queue_header **produce_q_header, struct vmci_queue_header **consume_q_header) { int result; result = qp_map_queue_headers(qpair->produce_q, qpair->consume_q); if (result == VMCI_SUCCESS) { *produce_q_header = qpair->produce_q->q_header; *consume_q_header = qpair->consume_q->q_header; } else if (qpair->produce_q->saved_header && qpair->consume_q->saved_header) { *produce_q_header = qpair->produce_q->saved_header; *consume_q_header = qpair->consume_q->saved_header; result = VMCI_SUCCESS; } return result; } /* * Callback from VMCI queue pair broker indicating that a queue * pair that was previously not ready, now either is ready or * gone forever. */ static int qp_wakeup_cb(void *client_data) { struct vmci_qp *qpair = (struct vmci_qp *)client_data; qp_lock(qpair); while (qpair->blocked > 0) { qpair->blocked--; qpair->generation++; wake_up(&qpair->event); } qp_unlock(qpair); return VMCI_SUCCESS; } /* * Makes the calling thread wait for the queue pair to become * ready for host side access. Returns true when thread is * woken up after queue pair state change, false otherwise. */ static bool qp_wait_for_ready_queue(struct vmci_qp *qpair) { unsigned int generation; qpair->blocked++; generation = qpair->generation; qp_unlock(qpair); wait_event(qpair->event, generation != qpair->generation); qp_lock(qpair); return true; } /* * Enqueues a given buffer to the produce queue using the provided * function. As many bytes as possible (space available in the queue) * are enqueued. Assumes the queue->mutex has been acquired. Returns * VMCI_ERROR_QUEUEPAIR_NOSPACE if no space was available to enqueue * data, VMCI_ERROR_INVALID_SIZE, if any queue pointer is outside the * queue (as defined by the queue size), VMCI_ERROR_INVALID_ARGS, if * an error occured when accessing the buffer, * VMCI_ERROR_QUEUEPAIR_NOTATTACHED, if the queue pair pages aren't * available. Otherwise, the number of bytes written to the queue is * returned. Updates the tail pointer of the produce queue. */ static ssize_t qp_enqueue_locked(struct vmci_queue *produce_q, struct vmci_queue *consume_q, const u64 produce_q_size, struct iov_iter *from) { s64 free_space; u64 tail; size_t buf_size = iov_iter_count(from); size_t written; ssize_t result; result = qp_map_queue_headers(produce_q, consume_q); if (unlikely(result != VMCI_SUCCESS)) return result; free_space = vmci_q_header_free_space(produce_q->q_header, consume_q->q_header, produce_q_size); if (free_space == 0) return VMCI_ERROR_QUEUEPAIR_NOSPACE; if (free_space < VMCI_SUCCESS) return (ssize_t) free_space; written = (size_t) (free_space > buf_size ? buf_size : free_space); tail = vmci_q_header_producer_tail(produce_q->q_header); if (likely(tail + written < produce_q_size)) { result = qp_memcpy_to_queue_iter(produce_q, tail, from, written); } else { /* Tail pointer wraps around. */ const size_t tmp = (size_t) (produce_q_size - tail); result = qp_memcpy_to_queue_iter(produce_q, tail, from, tmp); if (result >= VMCI_SUCCESS) result = qp_memcpy_to_queue_iter(produce_q, 0, from, written - tmp); } if (result < VMCI_SUCCESS) return result; /* * This virt_wmb() ensures that data written to the queue * is observable before the new producer_tail is. */ virt_wmb(); vmci_q_header_add_producer_tail(produce_q->q_header, written, produce_q_size); return written; } /* * Dequeues data (if available) from the given consume queue. Writes data * to the user provided buffer using the provided function. * Assumes the queue->mutex has been acquired. * Results: * VMCI_ERROR_QUEUEPAIR_NODATA if no data was available to dequeue. * VMCI_ERROR_INVALID_SIZE, if any queue pointer is outside the queue * (as defined by the queue size). * VMCI_ERROR_INVALID_ARGS, if an error occured when accessing the buffer. * Otherwise the number of bytes dequeued is returned. * Side effects: * Updates the head pointer of the consume queue. */ static ssize_t qp_dequeue_locked(struct vmci_queue *produce_q, struct vmci_queue *consume_q, const u64 consume_q_size, struct iov_iter *to, bool update_consumer) { size_t buf_size = iov_iter_count(to); s64 buf_ready; u64 head; size_t read; ssize_t result; result = qp_map_queue_headers(produce_q, consume_q); if (unlikely(result != VMCI_SUCCESS)) return result; buf_ready = vmci_q_header_buf_ready(consume_q->q_header, produce_q->q_header, consume_q_size); if (buf_ready == 0) return VMCI_ERROR_QUEUEPAIR_NODATA; if (buf_ready < VMCI_SUCCESS) return (ssize_t) buf_ready; /* * This virt_rmb() ensures that data from the queue will be read * after we have determined how much is ready to be consumed. */ virt_rmb(); read = (size_t) (buf_ready > buf_size ? buf_size : buf_ready); head = vmci_q_header_consumer_head(produce_q->q_header); if (likely(head + read < consume_q_size)) { result = qp_memcpy_from_queue_iter(to, consume_q, head, read); } else { /* Head pointer wraps around. */ const size_t tmp = (size_t) (consume_q_size - head); result = qp_memcpy_from_queue_iter(to, consume_q, head, tmp); if (result >= VMCI_SUCCESS) result = qp_memcpy_from_queue_iter(to, consume_q, 0, read - tmp); } if (result < VMCI_SUCCESS) return result; if (update_consumer) vmci_q_header_add_consumer_head(produce_q->q_header, read, consume_q_size); return read; } /* * vmci_qpair_alloc() - Allocates a queue pair. * @qpair: Pointer for the new vmci_qp struct. * @handle: Handle to track the resource. * @produce_qsize: Desired size of the producer queue. * @consume_qsize: Desired size of the consumer queue. * @peer: ContextID of the peer. * @flags: VMCI flags. * @priv_flags: VMCI priviledge flags. * * This is the client interface for allocating the memory for a * vmci_qp structure and then attaching to the underlying * queue. If an error occurs allocating the memory for the * vmci_qp structure no attempt is made to attach. If an * error occurs attaching, then the structure is freed. */ int vmci_qpair_alloc(struct vmci_qp **qpair, struct vmci_handle *handle, u64 produce_qsize, u64 consume_qsize, u32 peer, u32 flags, u32 priv_flags) { struct vmci_qp *my_qpair; int retval; struct vmci_handle src = VMCI_INVALID_HANDLE; struct vmci_handle dst = vmci_make_handle(peer, VMCI_INVALID_ID); enum vmci_route route; vmci_event_release_cb wakeup_cb; void *client_data; /* * Restrict the size of a queuepair. The device already * enforces a limit on the total amount of memory that can be * allocated to queuepairs for a guest. However, we try to * allocate this memory before we make the queuepair * allocation hypercall. On Linux, we allocate each page * separately, which means rather than fail, the guest will * thrash while it tries to allocate, and will become * increasingly unresponsive to the point where it appears to * be hung. So we place a limit on the size of an individual * queuepair here, and leave the device to enforce the * restriction on total queuepair memory. (Note that this * doesn't prevent all cases; a user with only this much * physical memory could still get into trouble.) The error * used by the device is NO_RESOURCES, so use that here too. */ if (!QP_SIZES_ARE_VALID(produce_qsize, consume_qsize)) return VMCI_ERROR_NO_RESOURCES; retval = vmci_route(&src, &dst, false, &route); if (retval < VMCI_SUCCESS) route = vmci_guest_code_active() ? VMCI_ROUTE_AS_GUEST : VMCI_ROUTE_AS_HOST; if (flags & (VMCI_QPFLAG_NONBLOCK | VMCI_QPFLAG_PINNED)) { pr_devel("NONBLOCK OR PINNED set"); return VMCI_ERROR_INVALID_ARGS; } my_qpair = kzalloc(sizeof(*my_qpair), GFP_KERNEL); if (!my_qpair) return VMCI_ERROR_NO_MEM; my_qpair->produce_q_size = produce_qsize; my_qpair->consume_q_size = consume_qsize; my_qpair->peer = peer; my_qpair->flags = flags; my_qpair->priv_flags = priv_flags; wakeup_cb = NULL; client_data = NULL; if (VMCI_ROUTE_AS_HOST == route) { my_qpair->guest_endpoint = false; if (!(flags & VMCI_QPFLAG_LOCAL)) { my_qpair->blocked = 0; my_qpair->generation = 0; init_waitqueue_head(&my_qpair->event); wakeup_cb = qp_wakeup_cb; client_data = (void *)my_qpair; } } else { my_qpair->guest_endpoint = true; } retval = vmci_qp_alloc(handle, &my_qpair->produce_q, my_qpair->produce_q_size, &my_qpair->consume_q, my_qpair->consume_q_size, my_qpair->peer, my_qpair->flags, my_qpair->priv_flags, my_qpair->guest_endpoint, wakeup_cb, client_data); if (retval < VMCI_SUCCESS) { kfree(my_qpair); return retval; } *qpair = my_qpair; my_qpair->handle = *handle; return retval; } EXPORT_SYMBOL_GPL(vmci_qpair_alloc); /* * vmci_qpair_detach() - Detatches the client from a queue pair. * @qpair: Reference of a pointer to the qpair struct. * * This is the client interface for detaching from a VMCIQPair. * Note that this routine will free the memory allocated for the * vmci_qp structure too. */ int vmci_qpair_detach(struct vmci_qp **qpair) { int result; struct vmci_qp *old_qpair; if (!qpair || !(*qpair)) return VMCI_ERROR_INVALID_ARGS; old_qpair = *qpair; result = qp_detatch(old_qpair->handle, old_qpair->guest_endpoint); /* * The guest can fail to detach for a number of reasons, and * if it does so, it will cleanup the entry (if there is one). * The host can fail too, but it won't cleanup the entry * immediately, it will do that later when the context is * freed. Either way, we need to release the qpair struct * here; there isn't much the caller can do, and we don't want * to leak. */ memset(old_qpair, 0, sizeof(*old_qpair)); old_qpair->handle = VMCI_INVALID_HANDLE; old_qpair->peer = VMCI_INVALID_ID; kfree(old_qpair); *qpair = NULL; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_detach); /* * vmci_qpair_get_produce_indexes() - Retrieves the indexes of the producer. * @qpair: Pointer to the queue pair struct. * @producer_tail: Reference used for storing producer tail index. * @consumer_head: Reference used for storing the consumer head index. * * This is the client interface for getting the current indexes of the * QPair from the point of the view of the caller as the producer. */ int vmci_qpair_get_produce_indexes(const struct vmci_qp *qpair, u64 *producer_tail, u64 *consumer_head) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; int result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) vmci_q_header_get_pointers(produce_q_header, consume_q_header, producer_tail, consumer_head); qp_unlock(qpair); if (result == VMCI_SUCCESS && ((producer_tail && *producer_tail >= qpair->produce_q_size) || (consumer_head && *consumer_head >= qpair->produce_q_size))) return VMCI_ERROR_INVALID_SIZE; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_get_produce_indexes); /* * vmci_qpair_get_consume_indexes() - Retrieves the indexes of the consumer. * @qpair: Pointer to the queue pair struct. * @consumer_tail: Reference used for storing consumer tail index. * @producer_head: Reference used for storing the producer head index. * * This is the client interface for getting the current indexes of the * QPair from the point of the view of the caller as the consumer. */ int vmci_qpair_get_consume_indexes(const struct vmci_qp *qpair, u64 *consumer_tail, u64 *producer_head) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; int result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) vmci_q_header_get_pointers(consume_q_header, produce_q_header, consumer_tail, producer_head); qp_unlock(qpair); if (result == VMCI_SUCCESS && ((consumer_tail && *consumer_tail >= qpair->consume_q_size) || (producer_head && *producer_head >= qpair->consume_q_size))) return VMCI_ERROR_INVALID_SIZE; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_get_consume_indexes); /* * vmci_qpair_produce_free_space() - Retrieves free space in producer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of free * space in the QPair from the point of the view of the caller as * the producer which is the common case. Returns < 0 if err, else * available bytes into which data can be enqueued if > 0. */ s64 vmci_qpair_produce_free_space(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_free_space(produce_q_header, consume_q_header, qpair->produce_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_produce_free_space); /* * vmci_qpair_consume_free_space() - Retrieves free space in consumer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of free * space in the QPair from the point of the view of the caller as * the consumer which is not the common case. Returns < 0 if err, else * available bytes into which data can be enqueued if > 0. */ s64 vmci_qpair_consume_free_space(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_free_space(consume_q_header, produce_q_header, qpair->consume_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_consume_free_space); /* * vmci_qpair_produce_buf_ready() - Gets bytes ready to read from * producer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of * enqueued data in the QPair from the point of the view of the * caller as the producer which is not the common case. Returns < 0 if err, * else available bytes that may be read. */ s64 vmci_qpair_produce_buf_ready(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_buf_ready(produce_q_header, consume_q_header, qpair->produce_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_produce_buf_ready); /* * vmci_qpair_consume_buf_ready() - Gets bytes ready to read from * consumer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of * enqueued data in the QPair from the point of the view of the * caller as the consumer which is the normal case. Returns < 0 if err, * else available bytes that may be read. */ s64 vmci_qpair_consume_buf_ready(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_buf_ready(consume_q_header, produce_q_header, qpair->consume_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_consume_buf_ready); /* * vmci_qpair_enqueue() - Throw data on the queue. * @qpair: Pointer to the queue pair struct. * @buf: Pointer to buffer containing data * @buf_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for enqueueing data into the queue. * Returns number of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enqueue(struct vmci_qp *qpair, const void *buf, size_t buf_size, int buf_type) { ssize_t result; struct iov_iter from; struct kvec v = {.iov_base = (void *)buf, .iov_len = buf_size}; if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; iov_iter_kvec(&from, ITER_SOURCE, &v, 1, buf_size); qp_lock(qpair); do { result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, &from); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_enqueue); /* * vmci_qpair_dequeue() - Get data from the queue. * @qpair: Pointer to the queue pair struct. * @buf: Pointer to buffer for the data * @buf_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for dequeueing data from the queue. * Returns number of bytes dequeued or < 0 on error. */ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, void *buf, size_t buf_size, int buf_type) { ssize_t result; struct iov_iter to; struct kvec v = {.iov_base = buf, .iov_len = buf_size}; if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; iov_iter_kvec(&to, ITER_DEST, &v, 1, buf_size); qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &to, true); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_dequeue); /* * vmci_qpair_peek() - Peek at the data in the queue. * @qpair: Pointer to the queue pair struct. * @buf: Pointer to buffer for the data * @buf_size: Length of buffer. * @buf_type: Buffer type (Unused on Linux). * * This is the client interface for peeking into a queue. (I.e., * copy data from the queue without updating the head pointer.) * Returns number of bytes dequeued or < 0 on error. */ ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size, int buf_type) { struct iov_iter to; struct kvec v = {.iov_base = buf, .iov_len = buf_size}; ssize_t result; if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; iov_iter_kvec(&to, ITER_DEST, &v, 1, buf_size); qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &to, false); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_peek); /* * vmci_qpair_enquev() - Throw data on the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer containing data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for enqueueing data into the queue. * This function uses IO vectors to handle the work. Returns number * of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, &msg->msg_iter); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_enquev); /* * vmci_qpair_dequev() - Get data from the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer for the data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for dequeueing data from the queue. * This function uses IO vectors to handle the work. Returns number * of bytes dequeued or < 0 on error. */ ssize_t vmci_qpair_dequev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &msg->msg_iter, true); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_dequev); /* * vmci_qpair_peekv() - Peek at the data in the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer for the data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused on Linux). * * This is the client interface for peeking into a queue. (I.e., * copy data from the queue without updating the head pointer.) * This function uses IO vectors to handle the work. Returns number * of bytes peeked or < 0 on error. */ ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &msg->msg_iter, false); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_peekv);
88 27 24 11 31 36 24 19 36 13 36 36 27 11 36 30 9 35 5 17 17 32 22 17 32 2 12 32 30 27 26 27 17 3 3 3 15 31 31 31 13 13 22 17 32 32 31 22 17 17 17 17 2 2 2 17 17 17 17 20 20 22 32 32 32 32 32 32 32 17 32 31 16 27 13 27 16 16 31 30 30 27 27 27 31 30 21 21 21 17 17 17 17 17 95 95 110 110 111 95 95 3 2 2 2 3 23 24 25 25 25 24 23 22 20 22 22 22 22 16 16 16 6 6 6 52 88 5 5 5 5 4 1 6 4 1 1 5 5 5 4 4 4 115 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 3 2 2 1 1 5 9 9 4 4 4 4 4 4 4 4 4 9 4 5 5 5 5 2 3 3 3 5 4 4 4 3 3 3 5 5 9 12 12 12 12 12 10 10 10 10 10 10 10 10 10 19 19 19 3 2 2 2 19 17 17 17 15 15 10 10 5 17 17 14 15 15 15 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 41 1 1 86 10 6 2 44 47 12 86 66 1 1 1 1 1 1 1 1 62 62 63 62 32 63 28 63 63 1 62 37 1 63 60 60 60 42 29 60 60 60 60 34 34 25 60 60 2 4 4 4 3 4 3 2 2 4 4 4 4 4 4 4 4 1 2 2 2 2 4 1 3 4 3 4 7 3 7 5 2 5 2 1 3 2 3 3 3 7 2 11 3 3 14 14 2 12 5 12 9 1 11 2 9 7 7 5 7 2 7 7 7 26 26 23 23 2 22 21 14 14 8 19 26 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_tunnel.h" static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; u16 vid = *(u16 *)arg->key; return vle->vid != vid; } static const struct rhashtable_params br_vlan_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, vnode), .key_offset = offsetof(struct net_bridge_vlan, vid), .key_len = sizeof(u16), .nelem_hint = 3, .max_size = VLAN_N_VID, .obj_cmpfn = br_vlan_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) { return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, const struct net_bridge_vlan *v) { if (vg->pvid == v->vid) return; smp_wmb(); br_vlan_set_pvid_state(vg, v->state); vg->pvid = v->vid; } static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) return; smp_wmb(); vg->pvid = 0; } /* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. */ static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, bool commit) { struct net_bridge_vlan_group *vg; bool change; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); /* check if anything would be changed on commit */ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); if (!commit) goto out; if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(vg, v); else __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; out: return change; } static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) { return __vlan_flags_update(v, flags, false); } static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) { __vlan_flags_update(v, flags, true); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, v->vid); v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; return err; } static void __vlan_add_list(struct net_bridge_vlan *v) { struct net_bridge_vlan_group *vg; struct list_head *headp, *hpos; struct net_bridge_vlan *vent; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); } static void __vlan_del_list(struct net_bridge_vlan *v) { list_del_rcu(&v->vlist); } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, const struct net_bridge_vlan *v) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ err = br_switchdev_port_vlan_del(dev, v->vid); if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) vlan_vid_del(dev, br->vlan_proto, v->vid); return err == -EOPNOTSUPP ? 0 : err; } /* Returns a master vlan, if it didn't exist it gets created. In all cases * a reference is taken to the master vlan before returning. */ static struct net_bridge_vlan * br_vlan_get_master(struct net_bridge *br, u16 vid, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; vg = br_vlan_group(br); masterv = br_vlan_find(vg, vid); if (!masterv) { bool changed; /* missing global ctx, create it now */ if (br_vlan_add(br, vid, 0, &changed, extack)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; refcount_set(&masterv->refcnt, 1); return masterv; } refcount_inc(&masterv->refcnt); return masterv; } static void br_master_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(!br_vlan_is_master(v)); free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_put_master(struct net_bridge_vlan *masterv) { struct net_bridge_vlan_group *vg; if (!br_vlan_is_master(masterv)) return; vg = br_vlan_group(masterv->br); if (refcount_dec_and_test(&masterv->refcnt)) { rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); br_multicast_toggle_one_vlan(masterv, false); br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } static void nbp_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_init_state(struct net_bridge_vlan *v) { struct net_bridge *br; if (br_vlan_is_master(v)) br = v->br; else br = v->port->br; if (br_opt_get(br, BROPT_MST_ENABLED)) { br_mst_vlan_init_state(v); return; } v->state = BR_STATE_FORWARDING; v->msti = 0; } /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: * 1. vlan is being added on a port (no master flags, global entry exists) * 2. vlan is being added on a bridge (both master and brentry flags) * 3. vlan is being added on a port, but a global entry didn't exist which * is being created right now (master flag set, brentry flag unset), the * global entry is used for global per-vlan features, but not for filtering * 4. same as 3 but with both master and brentry flags set so the entry * will be used for filtering in both the port and the bridge */ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_device *dev; struct net_bridge *br; int err; if (br_vlan_is_master(v)) { br = v->br; dev = br->dev; vg = br_vlan_group(br); } else { p = v->port; br = p->br; dev = p->dev; vg = nbp_vlan_group(p); } if (p) { /* Add VLAN to the device filter if it is supported. * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ err = __vlan_vid_add(dev, br, v, flags, extack); if (err) goto out; /* need to work on the master vlan too */ if (flags & BRIDGE_VLAN_INFO_MASTER) { bool changed; err = br_vlan_add(br, v->vid, flags | BRIDGE_VLAN_INFO_BRENTRY, &changed, extack); if (err) goto out_filt; if (changed) br_vlan_notify(br, NULL, v->vid, 0, RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); if (!masterv) { err = -ENOMEM; goto out_filt; } v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { v->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!v->stats) { err = -ENOMEM; goto out_filt; } v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { if (br_vlan_should_use(v)) { err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err && err != -EOPNOTSUPP) goto out; } br_multicast_ctx_init(br, v, &v->br_mcast_ctx); v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); if (err) { br_err(br, "failed insert local address into bridge forwarding table\n"); goto out_filt; } vg->num_vlans++; } /* set the state before publishing */ br_vlan_init_state(v); err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) goto out_fdb_insert; __vlan_add_list(v); __vlan_flags_commit(v, flags); br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); out: return err; out_fdb_insert: if (br_vlan_should_use(v)) { br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); vg->num_vlans--; } out_filt: if (p) { __vlan_vid_del(dev, br, v); if (masterv) { if (v->stats && masterv->stats != v->stats) free_percpu(v->stats); v->stats = NULL; br_vlan_put_master(masterv); v->brvlan = NULL; } } else { br_switchdev_port_vlan_del(dev, v->vid); } goto out; } static int __vlan_del(struct net_bridge_vlan *v) { struct net_bridge_vlan *masterv = v; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0; if (br_vlan_is_master(v)) { vg = br_vlan_group(v->br); } else { p = v->port; vg = nbp_vlan_group(v->port); masterv = v->brvlan; } __vlan_delete_pvid(vg, v->vid); if (p) { err = __vlan_vid_del(p->dev, p->br, v); if (err) goto out; } else { err = br_switchdev_port_vlan_del(v->br->dev, v->vid); if (err && err != -EOPNOTSUPP) goto out; err = 0; } if (br_vlan_should_use(v)) { v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans--; } if (masterv != v) { vlan_tunnel_info_del(vg, v); rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); br_multicast_toggle_one_vlan(v, false); br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); out: return err; } static void __vlan_group_free(struct net_bridge_vlan_group *vg) { WARN_ON(!list_empty(&vg->vlan_list)); rhashtable_destroy(&vg->vlan_hash); vlan_tunnel_deinit(vg); kfree(vg); } static void __vlan_flush(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; u16 v_start = 0, v_end = 0; int err; __vlan_delete_pvid(vg, vg->pvid); list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { /* take care of disjoint ranges */ if (!v_start) { v_start = vlan->vid; } else if (vlan->vid - v_end != 1) { /* found range end, notify and start next one */ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); v_start = vlan->vid; } v_end = vlan->vid; err = __vlan_del(vlan); if (err) { br_err(br, "port %u(%s) failed to delete vlan %d: %pe\n", (unsigned int) p->port_no, p->dev->name, vlan->vid, ERR_PTR(err)); } } /* notify about the last/whole vlan range */ if (v_start) br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; /* At this point, we know that the frame was filtered and contains * a valid vlan id. If the vlan id has untagged flag set, * send untagged; otherwise, send tagged. */ br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); /* Vlan entry must be configured at this point. The * only exception is the bridge is set in promisc mode and the * packet is destined for the bridge device. In this case * pass the packet as is. */ if (!v || !br_vlan_should_use(v)) { if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { goto out; } else { kfree_skb(skb); return NULL; } } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->tx_bytes, skb->len); u64_stats_inc(&stats->tx_packets); u64_stats_update_end(&stats->syncp); } /* If the skb will be sent using forwarding offload, the assumption is * that the switchdev will inject the packet into hardware together * with the bridge VLAN, so that it can be forwarded according to that * VLAN. The switchdev should deal with popping the VLAN header in * hardware on each egress port as appropriate. So only strip the VLAN * header if forwarding offload is not being used. */ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { kfree_skb(skb); return NULL; } out: return skb; } /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; bool tagged; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. */ if (unlikely(!skb_vlan_tag_present(skb) && skb->protocol == br->vlan_proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; } if (!br_vlan_get_tag(skb, vid)) { /* Tagged frame */ if (skb->vlan_proto != br->vlan_proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (unlikely(!skb)) return false; skb_pull(skb, ETH_HLEN); skb_reset_mac_len(skb); *vid = 0; tagged = false; } else { tagged = true; } } else { /* Untagged frame */ tagged = false; } if (!*vid) { u16 pvid = br_get_pvid(vg); /* Frame had a tag with VID 0 or did not have a tag. * See if pvid is set on this port. That tells us which * vlan untagged or priority-tagged traffic belongs to. */ if (!pvid) goto drop; /* PVID is set on this port. Any untagged or priority-tagged * ingress frame is considered to belong to this vlan. */ *vid = pvid; if (likely(!tagged)) /* Untagged Frame. */ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. * At this point, we know that skb->vlan_tci VID * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; /* if snooping and stats are disabled we can avoid the lookup */ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); if (!br_vlan_state_allowed(*state, true)) goto drop; } return true; } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_state(v); if (!br_vlan_state_allowed(*state, true)) goto drop; } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb->len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } *vlan = v; return true; drop: kfree_skb(skb); return false; } bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { const struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); if (v && br_vlan_should_use(v) && br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; } /* Called under RCU */ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return true; vg = nbp_vlan_group_rcu(p); if (!vg || !vg->num_vlans) return false; if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) *vid = 0; if (!*vid) { *vid = br_get_pvid(vg); if (!*vid || !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } v = br_vlan_find(vg, *vid); if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; } static int br_vlan_add_existing(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u16 flags, bool *changed, struct netlink_ext_ack *extack) { bool would_change = __vlan_flags_would_change(vlan, flags); bool becomes_brentry = false; int err; if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) return -EINVAL; becomes_brentry = true; } /* Master VLANs that aren't brentries weren't notified before, * time to notify them now. */ if (becomes_brentry || would_change) { err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, would_change, extack); if (err && err != -EOPNOTSUPP) return err; } if (becomes_brentry) { /* It was only kept for port vlans, now make it real */ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { br_err(br, "failed to insert local address into bridge forwarding table\n"); goto err_fdb_insert; } refcount_inc(&vlan->refcnt); vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; *changed = true; br_multicast_toggle_one_vlan(vlan, true); } __vlan_flags_commit(vlan, flags); if (would_change) *changed = true; return 0; err_fdb_insert: br_switchdev_port_vlan_del(br->dev, vlan->vid); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) return br_vlan_add_existing(br, vg, vlan, flags, changed, extack); vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vlan->stats) { kfree(vlan); return -ENOMEM; } vlan->vid = vid; vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; vlan->br = br; if (flags & BRIDGE_VLAN_INFO_BRENTRY) refcount_set(&vlan->refcnt, 1); ret = __vlan_add(vlan, flags, extack); if (ret) { free_percpu(vlan->stats); kfree(vlan); } else { *changed = true; } return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int br_vlan_delete(struct net_bridge *br, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = br_vlan_group(br); v = br_vlan_find(vg, vid); if (!v || !br_vlan_is_brentry(v)) return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); br_fdb_delete_by_port(br, NULL, vid, 0); vlan_tunnel_info_del(vg, v); return __vlan_del(v); } void br_vlan_flush(struct net_bridge *br) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = br_vlan_group(br); __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_net(); __vlan_group_free(vg); } struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { if (!vg) return NULL; return br_vlan_lookup(&vg->vlan_hash, vid); } /* Must be protected by RTNL. */ static void recalculate_group_addr(struct net_bridge *br) { if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) return; spin_lock_bh(&br->lock); if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) { /* Bridge Group Address */ br->group_addr[5] = 0x00; } else { /* vlan_enabled && ETH_P_8021AD */ /* Provider Bridge Group Address */ br->group_addr[5] = 0x08; } spin_unlock_bh(&br->lock); } /* Must be protected by RTNL. */ void br_recalculate_fwd_mask(struct net_bridge *br) { if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; else /* vlan_enabled && ETH_P_8021AD */ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ~(1u << br->group_addr[5]); } int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = val, }; int err; if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) { br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; } br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); br_multicast_toggle_vlan_snooping(br, false, NULL); } return 0; } bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return br_opt_get(br, BROPT_VLAN_ENABLED); } EXPORT_SYMBOL_GPL(br_vlan_enabled); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) { struct net_bridge *br = netdev_priv(dev); *p_proto = ntohs(br->vlan_proto); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_proto); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_protocol = ntohs(proto), }; int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; struct net_bridge_vlan_group *vg; __be16 oldproto = br->vlan_proto; if (br->vlan_proto == proto) return 0; err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; } } br->vlan_proto = proto; recalculate_group_addr(br); br_recalculate_fwd_mask(br); /* Delete VLANs for the old proto from the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, oldproto, vlan->vid); } } return 0; err_filt: attr.u.vlan_protocol = ntohs(oldproto); switchdev_port_attr_set(br->dev, &attr, NULL); list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } list_for_each_entry_continue_reverse(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } } return err; } int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) { switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); break; default: return -EINVAL; } return 0; } int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) { struct net_bridge_port *p; /* allow to change the option if there are no port vlans configured */ list_for_each_entry(p, &br->port_list, list) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); if (vg->num_vlans) return -EBUSY; } switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); break; default: return -EINVAL; } return 0; } static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; if (vid != vg->pvid) return false; v = br_vlan_lookup(&vg->vlan_hash, vid); if (v && br_vlan_should_use(v) && (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return true; return false; } static void br_vlan_disable_default_pvid(struct net_bridge *br) { struct net_bridge_port *p; u16 pvid = br->default_pvid; /* Disable default_pvid on all ports where it is still * configured. */ if (vlan_default_pvid(br_vlan_group(br), pvid)) { if (!br_vlan_delete(br, pvid)) br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } list_for_each_entry(p, &br->port_list, list) { if (vlan_default_pvid(nbp_vlan_group(p), pvid) && !nbp_vlan_delete(p, pvid)) br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; } int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack) { const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; unsigned long *changed; bool vlchange; u16 old_pvid; int err = 0; if (!pvid) { br_vlan_disable_default_pvid(br); return 0; } changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!changed) return -ENOMEM; old_pvid = br->default_pvid; /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = br_vlan_group(br); pvent = br_vlan_find(vg, pvid); if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, extack); if (err) goto out; if (br_vlan_delete(br, old_pvid)) br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); __set_bit(0, changed); } list_for_each_entry(p, &br->port_list, list) { /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = nbp_vlan_group(p); if ((old_pvid && !vlan_default_pvid(vg, old_pvid)) || br_vlan_find(vg, pvid)) continue; err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, extack); if (err) goto err_port; if (nbp_vlan_delete(p, old_pvid)) br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); __set_bit(p->port_no, changed); } br->default_pvid = pvid; out: bitmap_free(changed); return err; err_port: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!test_bit(p->port_no, changed)) continue; if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); } nbp_vlan_delete(p, pvid); br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); } br_vlan_delete(br, pvid); br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { u16 pvid = val; int err = 0; if (val >= VLAN_VID_MASK) return -EINVAL; if (pvid == br->default_pvid) goto out; /* Only allow default pvid change when filtering is disabled */ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { pr_info_once("Please disable vlan filtering to change default_pvid\n"); err = -EPERM; goto out; } err = __br_vlan_set_default_pvid(br, pvid, extack); out: return err; } int br_vlan_init(struct net_bridge *br) { struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc(sizeof(*vg), GFP_KERNEL); if (!vg) goto out; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; rcu_assign_pointer(br->vlgrp, vg); out: return ret; err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: kfree(vg); goto out; } int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), }; struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); if (!vg) goto out; ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { bool changed; ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &changed, extack); if (ret) goto err_vlan_add; br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; err_vlan_add: RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); vlan_tunnel_deinit(vg); err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: err_vlan_enabled: kfree(vg); goto out; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { bool would_change = __vlan_flags_would_change(vlan, flags); if (would_change) { /* Pass the flags to the hardware bridge */ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, true, extack); if (ret && ret != -EOPNOTSUPP) return ret; } __vlan_flags_commit(vlan, flags); *changed = would_change; return 0; } vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->vid = vid; vlan->port = port; ret = __vlan_add(vlan, flags, extack); if (ret) kfree(vlan); else *changed = true; return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan *v; ASSERT_RTNL(); v = br_vlan_find(nbp_vlan_group(port), vid); if (!v) return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); br_fdb_delete_by_port(port->br, port, vid, 0); return __vlan_del(v); } void nbp_vlan_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_net(); __vlan_group_free(vg); } void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { u64 rxpackets, rxbytes, txpackets, txbytes; struct pcpu_sw_netstats *cpu_stats; unsigned int start; cpu_stats = per_cpu_ptr(v->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rxpackets = u64_stats_read(&cpu_stats->rx_packets); rxbytes = u64_stats_read(&cpu_stats->rx_bytes); txbytes = u64_stats_read(&cpu_stats->tx_bytes); txpackets = u64_stats_read(&cpu_stats->tx_packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); u64_stats_add(&stats->tx_bytes, txbytes); u64_stats_add(&stats->tx_packets, txpackets); } } int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid); int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); el