Total coverage: 447319 (26%) of 1783054
// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/dir.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 */
#include <linux/unaligned.h>
#include <linux/fs.h>
#include <linux/f2fs_fs.h>
#include <linux/sched/signal.h>
#include <linux/unicode.h>
#include "f2fs.h"
#include "node.h"
#include "acl.h"
#include "xattr.h"
#include <trace/events/f2fs.h>

#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif

static unsigned long dir_blocks(struct inode *inode)
{
        return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
                                                        >> PAGE_SHIFT;
}

static unsigned int dir_buckets(unsigned int level, int dir_level)
{
        if (level + dir_level < MAX_DIR_HASH_DEPTH / 2)
                return BIT(level + dir_level);
        else
                return MAX_DIR_BUCKETS;
}

static unsigned int bucket_blocks(unsigned int level)
{
        if (level < MAX_DIR_HASH_DEPTH / 2)
                return 2;
        else
                return 4;
}

#if IS_ENABLED(CONFIG_UNICODE)
/* If @dir is casefolded, initialize @fname->cf_name from @fname->usr_fname.
*/ int f2fs_init_casefolded_name(const struct inode *dir, struct f2fs_filename *fname) { struct super_block *sb = dir->i_sb; unsigned char *buf; int len; if (IS_CASEFOLDED(dir) && !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { buf = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!buf) return -ENOMEM; len = utf8_casefold(sb->s_encoding, fname->usr_fname, buf, F2FS_NAME_LEN); if (len <= 0) { kmem_cache_free(f2fs_cf_name_slab, buf); if (sb_has_strict_encoding(sb)) return -EINVAL; /* fall back to treating name as opaque byte sequence */ return 0; } fname->cf_name.name = buf; fname->cf_name.len = len; } return 0; } void f2fs_free_casefolded_name(struct f2fs_filename *fname) { unsigned char *buf = (unsigned char *)fname->cf_name.name; if (buf) { kmem_cache_free(f2fs_cf_name_slab, buf); fname->cf_name.name = NULL; } } #endif /* CONFIG_UNICODE */ static int __f2fs_setup_filename(const struct inode *dir, const struct fscrypt_name *crypt_name, struct f2fs_filename *fname) { int err; memset(fname, 0, sizeof(*fname)); fname->usr_fname = crypt_name->usr_fname; fname->disk_name = crypt_name->disk_name; #ifdef CONFIG_FS_ENCRYPTION fname->crypto_buf = crypt_name->crypto_buf; #endif if (crypt_name->is_nokey_name) { /* hash was decoded from the no-key name */ fname->hash = cpu_to_le32(crypt_name->hash); } else { err = f2fs_init_casefolded_name(dir, fname); if (err) { f2fs_free_filename(fname); return err; } f2fs_hash_filename(dir, fname); } return 0; } /* * Prepare to search for @iname in @dir. This is similar to * fscrypt_setup_filename(), but this also handles computing the casefolded name * and the f2fs dirhash if needed, then packing all the information about this * filename up into a 'struct f2fs_filename'. */ int f2fs_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct f2fs_filename *fname) { struct fscrypt_name crypt_name; int err; err = fscrypt_setup_filename(dir, iname, lookup, &crypt_name); if (err) return err; return __f2fs_setup_filename(dir, &crypt_name, fname); } /* * Prepare to look up @dentry in @dir. This is similar to * fscrypt_prepare_lookup(), but this also handles computing the casefolded name * and the f2fs dirhash if needed, then packing all the information about this * filename up into a 'struct f2fs_filename'. 
*/ int f2fs_prepare_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_filename *fname) { struct fscrypt_name crypt_name; int err; err = fscrypt_prepare_lookup(dir, dentry, &crypt_name); if (err) return err; return __f2fs_setup_filename(dir, &crypt_name, fname); } void f2fs_free_filename(struct f2fs_filename *fname) { #ifdef CONFIG_FS_ENCRYPTION kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; #endif f2fs_free_casefolded_name(fname); } static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) bidx += mul_u32_u32(dir_buckets(i, dir_level), bucket_blocks(i)); bidx += idx * bucket_blocks(level); return bidx; } static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct page *dentry_page, const struct f2fs_filename *fname, int *max_slots, bool use_hash) { struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); make_dentry_ptr_block(dir, &d, dentry_blk); return f2fs_find_target_dentry(&d, fname, max_slots, use_hash); } static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) { struct fscrypt_name f; #if IS_ENABLED(CONFIG_UNICODE) if (fname->cf_name.name) return generic_ci_match(dir, fname->usr_fname, &fname->cf_name, de_name, de_name_len); #endif f.usr_fname = fname->usr_fname; f.disk_name = fname->disk_name; #ifdef CONFIG_FS_ENCRYPTION f.crypto_buf = fname->crypto_buf; #endif return fscrypt_match_name(&f, de_name, de_name_len); } struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, const struct f2fs_filename *fname, int *max_slots, bool use_hash) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; int res = 0; if (max_slots) *max_slots = 0; while (bit_pos < d->max) { if (!test_bit_le(bit_pos, d->bitmap)) { bit_pos++; max_len++; continue; } de = &d->dentry[bit_pos]; if (unlikely(!de->name_len)) { bit_pos++; continue; } if (!use_hash || de->hash_code == fname->hash) { res = f2fs_match_name(d->inode, fname, d->filename[bit_pos], le16_to_cpu(de->name_len)); if (res < 0) return ERR_PTR(res); if (res) goto found; } if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; found: if (max_slots && max_len > *max_slots) *max_slots = max_len; return de; } static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, const struct f2fs_filename *fname, struct page **res_page, bool use_hash) { int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block, bucket_no; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; pgoff_t next_pgofs; bool room = false; int max_slots; nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); bucket_no = use_hash ? 
le32_to_cpu(fname->hash) % nbucket : 0; start_find_bucket: bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, bucket_no); end_block = bidx + nblock; while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; bidx = next_pgofs; continue; } else { *res_page = dentry_page; break; } } de = find_in_block(dir, dentry_page, fname, &max_slots, use_hash); if (IS_ERR(de)) { *res_page = ERR_CAST(de); de = NULL; break; } else if (de) { *res_page = dentry_page; break; } if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); bidx++; } if (de) return de; if (likely(use_hash)) { if (room && F2FS_I(dir)->chash != fname->hash) { F2FS_I(dir)->chash = fname->hash; F2FS_I(dir)->clevel = level; } } else if (++bucket_no < nbucket) { goto start_find_bucket; } return NULL; } struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, const struct f2fs_filename *fname, struct page **res_page) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; unsigned int max_depth; unsigned int level; bool use_hash = true; *res_page = NULL; #if IS_ENABLED(CONFIG_UNICODE) start_find_entry: #endif if (f2fs_has_inline_dentry(dir)) { de = f2fs_find_in_inline_dir(dir, fname, res_page, use_hash); goto out; } if (npages == 0) goto out; max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; f2fs_i_depth_write(dir, max_depth); } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, fname, res_page, use_hash); if (de || IS_ERR(*res_page)) break; } out: #if IS_ENABLED(CONFIG_UNICODE) if (IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; } #endif /* This is to increase the speed of f2fs_create */ if (!de) F2FS_I(dir)->task = current; return de; } /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, const struct qstr *child, struct page **res_page) { struct f2fs_dir_entry *de = NULL; struct f2fs_filename fname; int err; err = f2fs_setup_filename(dir, child, 1, &fname); if (err) { if (err == -ENOENT) *res_page = NULL; else *res_page = ERR_PTR(err); return NULL; } de = __f2fs_find_entry(dir, &fname, res_page); f2fs_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { return f2fs_find_entry(dir, &dotdot_name, p); } ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); f2fs_put_page(*page, 0); } return res; } void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? 
NODE : DATA; lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } static void init_dent_inode(struct inode *dir, struct inode *inode, const struct f2fs_filename *fname, struct page *ipage) { struct f2fs_inode *ri; if (!fname) /* tmpfile case? */ return; f2fs_wait_on_page_writeback(ipage, NODE, true, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); if (IS_ENCRYPTED(dir)) { file_set_enc_name(inode); /* * Roll-forward recovery doesn't have encryption keys available, * so it can't compute the dirhash for encrypted+casefolded * filenames. Append it to i_name if possible. Else, disable * roll-forward recovery of the dentry (i.e., make fsync'ing the * file force a checkpoint) by setting LOST_PINO. */ if (IS_CASEFOLDED(dir)) { if (fname->disk_name.len + sizeof(f2fs_hash_t) <= F2FS_NAME_LEN) put_unaligned(fname->hash, (f2fs_hash_t *) &ri->i_name[fname->disk_name.len]); else file_lost_pino(inode); } } set_page_dirty(ipage); } void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { struct fscrypt_str dot = FSTR_INIT(".", 1); struct fscrypt_str dotdot = FSTR_INIT("..", 2); /* update dirent of "." */ f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); /* update dirent of ".." */ f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, struct inode *parent, struct page *page) { struct page *dentry_page; struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; if (f2fs_has_inline_dentry(inode)) return f2fs_make_empty_inline_dir(inode, parent, page); dentry_page = f2fs_get_new_data_page(inode, page, 0, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_do_make_empty_dir(inode, parent, &d); set_page_dirty(dentry_page); f2fs_put_page(dentry_page, 1); return 0; } struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, const struct f2fs_filename *fname, struct page *dpage) { struct page *page; int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = f2fs_new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { /* in order to handle error case */ get_page(page); err = make_empty_dir(inode, dir, page); if (err) { lock_page(page); goto put_error; } put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; err = f2fs_init_security(inode, dir, fname ? fname->usr_fname : NULL, page); if (err) goto put_error; if (IS_ENCRYPTED(inode)) { err = fscrypt_set_context(inode, page); if (err) goto put_error; } } else { page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } init_dent_inode(dir, inode, fname, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { if (!S_ISDIR(inode->i_mode)) file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. 
*/ if (inode->i_nlink == 0) f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); f2fs_i_links_write(inode, true); } return page; put_error: clear_nlink(inode); f2fs_update_inode(inode, page); f2fs_put_page(page, 1); return ERR_PTR(err); } void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); if (inode && is_inode_flag_set(inode, FI_INC_LINK)) clear_inode_flag(inode, FI_INC_LINK); } int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; next: zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start); if (zero_start >= max_slots) return max_slots; zero_end = find_next_bit_le(bitmap, max_slots, zero_start); if (zero_end - zero_start >= slots) return zero_start; bit_start = zero_end + 1; if (zero_end + 1 >= max_slots) return max_slots; goto next; } bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, const struct f2fs_filename *fname) { struct f2fs_dentry_ptr d; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(fname->disk_name.len); make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); return bit_pos < d.max; } void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct fscrypt_str *name, f2fs_hash_t name_hash, unsigned int bit_pos) { struct f2fs_dir_entry *de; int slots = GET_DENTRY_SLOTS(name->len); int i; de = &d->dentry[bit_pos]; de->hash_code = name_hash; de->name_len = cpu_to_le16(name->len); memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); de->file_type = fs_umode_to_ftype(mode); for (i = 0; i < slots; i++) { __set_bit_le(bit_pos + i, (void *)d->bitmap); /* avoid wrong garbage data for readdir */ if (i) (de + i)->name_len = 0; } } int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; unsigned int nbucket, nblock; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; int slots, err = 0; level = 0; slots = GET_DENTRY_SLOTS(fname->disk_name.len); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == fname->hash) { level = F2FS_I(dir)->clevel; F2FS_I(dir)->chash = 0; } start: if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) return -ENOSPC; if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) ++current_depth; nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, (le32_to_cpu(fname->hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); dentry_blk = page_address(dentry_page); bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; 
f2fs_put_page(dentry_page, 1); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } } make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, bit_pos); set_page_dirty(dentry_page); if (inode) { f2fs_i_pino_write(inode, dir->i_ino); /* synchronize inode page's data from inode cache */ if (is_inode_flag_set(inode, FI_NEW_INODE)) f2fs_update_inode(inode, page); f2fs_put_page(page, 1); } f2fs_update_parent_metadata(dir, inode, current_depth); fail: if (inode) f2fs_up_write(&F2FS_I(inode)->i_sem); f2fs_put_page(dentry_page, 1); return err; } int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode) { int err = -EAGAIN; if (f2fs_has_inline_dentry(dir)) { /* * Should get i_xattr_sem to keep the lock order: * i_xattr_sem -> inode_page lock used by f2fs_setxattr. */ f2fs_down_read(&F2FS_I(dir)->i_xattr_sem); err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); f2fs_up_read(&F2FS_I(dir)->i_xattr_sem); } if (err == -EAGAIN) err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } /* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). */ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_filename fname; struct page *page = NULL; struct f2fs_dir_entry *de = NULL; int err; err = f2fs_setup_filename(dir, name, 0, &fname); if (err) return err; /* * An immature stackable filesystem shows a race condition between lookup * and create. If we have same task when doing lookup and create, it's * definitely fine as expected by VFS normally. Otherwise, let's just * verify on-disk dentry one more time, which guarantees filesystem * consistency more. */ if (current != F2FS_I(dir)->task) { de = __f2fs_find_entry(dir, &fname, &page); F2FS_I(dir)->task = NULL; } if (de) { f2fs_put_page(page, 0); err = -EEXIST; } else if (IS_ERR(page)) { err = PTR_ERR(page); } else { err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } f2fs_free_filename(&fname); return err; } int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname) { struct page *page; int err = 0; f2fs_down_write(&F2FS_I(inode)->i_sem); page = f2fs_init_inode_metadata(inode, dir, fname, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } f2fs_put_page(page, 1); clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: f2fs_up_write(&F2FS_I(inode)->i_sem); return err; } void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); f2fs_down_write(&F2FS_I(inode)->i_sem); if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); inode_set_ctime_current(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { f2fs_i_links_write(inode, false); f2fs_i_size_write(inode, 0); } f2fs_up_write(&F2FS_I(inode)->i_sem); if (inode->i_nlink == 0) f2fs_add_orphan_inode(inode); else f2fs_release_orphan_inode(sbi); } /* * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. 
*/ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); pgoff_t index = page_folio(page)->index; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); if (F2FS_OPTION(F2FS_I_SB(dir)).fsync_mode == FSYNC_MODE_STRICT) f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); f2fs_wait_on_page_writeback(page, DATA, true, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); set_page_dirty(page); if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, index, index + 1)) { f2fs_clear_page_cache_dirty_tag(page_folio(page)); clear_page_dirty_for_io(page); ClearPageUptodate(page); clear_page_private_all(page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); } f2fs_put_page(page, 1); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); } bool f2fs_empty_dir(struct inode *dir) { unsigned long bidx = 0; struct page *dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; unsigned long nblock = dir_blocks(dir); if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); while (bidx < nblock) { pgoff_t next_pgofs; dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { bidx = next_pgofs; continue; } else { return false; } } dentry_blk = page_address(dentry_page); if (bidx == 0) bit_pos = 2; else bit_pos = 0; bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; bidx++; } return true; } int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); if (readdir_ra) blk_start_plug(&plug); while (bit_pos < d->max) { bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); if (bit_pos >= d->max) break; de = &d->dentry[bit_pos]; if (de->name_len == 0) { if (found_valid_dirent || !bit_pos) { f2fs_warn_ratelimited(sbi, "invalid namelen(0), ino:%u, run fsck to fix.", le32_to_cpu(de->ino)); set_sbi_flag(sbi, SBI_NEED_FSCK); } bit_pos++; ctx->pos = start_pos + bit_pos; continue; } d_type = fs_ftype_to_dtype(de->file_type); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); if (unlikely(bit_pos > d->max || le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_warn(sbi, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_DIRENT); goto out; } if 
(IS_ENCRYPTED(d->inode)) { int save_len = fstr->len; err = fscrypt_fname_disk_to_usr(d->inode, (u32)le32_to_cpu(de->hash_code), 0, &de_name, fstr); if (err) goto out; de_name = *fstr; fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) { err = 1; goto out; } if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; found_valid_dirent = true; } out: if (readdir_ra) blk_finish_plug(&plug); return err; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); struct f2fs_dentry_block *dentry_blk = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (IS_ENCRYPTED(inode)) { err = fscrypt_prepare_readdir(inode); if (err) goto out; err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr); if (err < 0) goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); goto out_free; } for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto out_free; } cond_resched(); /* readahead for multi pages of dir */ if (npages - n > 1 && !ra_has_index(ra, n)) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; n = next_pgofs; continue; } else { goto out_free; } } dentry_blk = page_address(dentry_page); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { f2fs_put_page(dentry_page, 0); break; } f2fs_put_page(dentry_page, 0); n++; } out_free: fscrypt_fname_free_buffer(&fstr); out: trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? err : 0; } const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif };
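The multi-level hash lookup above (find_in_level() together with dir_buckets(), bucket_blocks() and dir_block_index()) maps a filename hash to a small range of directory blocks at each level. Below is a minimal userspace sketch of that arithmetic only, not part of dir.c; the depth and bucket constants are assumptions mirroring include/linux/f2fs_fs.h rather than values taken from this file.

/*
 * Illustrative sketch: which on-disk blocks a hashed name falls into
 * at each directory level. Constants are assumed for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_DIR_HASH_DEPTH 63                                 /* assumed */
#define MAX_DIR_BUCKETS    (1U << ((MAX_DIR_HASH_DEPTH / 2) - 1))

static unsigned int dir_buckets(unsigned int level, int dir_level)
{
        if (level + dir_level < MAX_DIR_HASH_DEPTH / 2)
                return 1U << (level + dir_level);
        return MAX_DIR_BUCKETS;
}

static unsigned int bucket_blocks(unsigned int level)
{
        return (level < MAX_DIR_HASH_DEPTH / 2) ? 2 : 4;
}

/* First block index of bucket @idx at @level, as in dir_block_index(). */
static unsigned long dir_block_index(unsigned int level, int dir_level,
                                     unsigned int idx)
{
        unsigned long bidx = 0;
        unsigned int i;

        for (i = 0; i < level; i++)
                bidx += (unsigned long)dir_buckets(i, dir_level) *
                        bucket_blocks(i);
        return bidx + (unsigned long)idx * bucket_blocks(level);
}

int main(void)
{
        uint32_t hash = 0xdeadbeef;     /* stand-in for fname->hash */
        int dir_level = 0;              /* the directory's i_dir_level */
        unsigned int level;

        for (level = 0; level < 4; level++) {
                unsigned int nbucket = dir_buckets(level, dir_level);
                unsigned int bucket = hash % nbucket;
                unsigned long bidx = dir_block_index(level, dir_level, bucket);

                /* The lookup scans bucket_blocks(level) blocks per level. */
                printf("level %u: bucket %u of %u -> blocks %lu..%lu\n",
                       level, bucket, nbucket, bidx,
                       bidx + bucket_blocks(level) - 1);
        }
        return 0;
}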
// SPDX-License-Identifier: GPL-2.0
/*
 * USB Attached SCSI
 * Note that this is not the same as the USB Mass Storage driver
 *
 * Copyright Hans de Goede <hdegoede@redhat.com> for Red Hat, Inc.
2013 - 2016 * Copyright Matthew Wilcox for Intel Corp, 2010 * Copyright Sarah Sharp for Intel Corp, 2010 */ #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb_usual.h> #include <linux/usb/hcd.h> #include <linux/usb/storage.h> #include <linux/usb/uas.h> #include <scsi/scsi.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include "uas-detect.h" #include "scsiglue.h" #define MAX_CMNDS 256 struct uas_dev_info { struct usb_interface *intf; struct usb_device *udev; struct usb_anchor cmd_urbs; struct usb_anchor sense_urbs; struct usb_anchor data_urbs; u64 flags; int qdepth, resetting; unsigned cmd_pipe, status_pipe, data_in_pipe, data_out_pipe; unsigned use_streams:1; unsigned shutdown:1; struct scsi_cmnd *cmnd[MAX_CMNDS]; spinlock_t lock; struct work_struct work; struct work_struct scan_work; /* for async scanning */ }; enum { SUBMIT_STATUS_URB = BIT(1), ALLOC_DATA_IN_URB = BIT(2), SUBMIT_DATA_IN_URB = BIT(3), ALLOC_DATA_OUT_URB = BIT(4), SUBMIT_DATA_OUT_URB = BIT(5), ALLOC_CMD_URB = BIT(6), SUBMIT_CMD_URB = BIT(7), COMMAND_INFLIGHT = BIT(8), DATA_IN_URB_INFLIGHT = BIT(9), DATA_OUT_URB_INFLIGHT = BIT(10), COMMAND_ABORTED = BIT(11), IS_IN_WORK_LIST = BIT(12), }; /* Overrides scsi_pointer */ struct uas_cmd_info { unsigned int state; unsigned int uas_tag; struct urb *cmd_urb; struct urb *data_in_urb; struct urb *data_out_urb; }; /* I hate forward declarations, but I actually have a loop */ static int uas_submit_urbs(struct scsi_cmnd *cmnd, struct uas_dev_info *devinfo); static void uas_do_work(struct work_struct *work); static int uas_try_complete(struct scsi_cmnd *cmnd, const char *caller); static void uas_free_streams(struct uas_dev_info *devinfo); static void uas_log_cmd_state(struct scsi_cmnd *cmnd, const char *prefix, int status); /* * This driver needs its own workqueue, as we need to control memory allocation. * * In the course of error handling and power management uas_wait_for_pending_cmnds() * needs to flush pending work items. In these contexts we cannot allocate memory * by doing block IO as we would deadlock. For the same reason we cannot wait * for anything allocating memory not heeding these constraints. * * So we have to control all work items that can be on the workqueue we flush. * Hence we cannot share a queue and need our own. 
*/ static struct workqueue_struct *workqueue; static void uas_do_work(struct work_struct *work) { struct uas_dev_info *devinfo = container_of(work, struct uas_dev_info, work); struct uas_cmd_info *cmdinfo; struct scsi_cmnd *cmnd; unsigned long flags; int i, err; spin_lock_irqsave(&devinfo->lock, flags); if (devinfo->resetting) goto out; for (i = 0; i < devinfo->qdepth; i++) { if (!devinfo->cmnd[i]) continue; cmnd = devinfo->cmnd[i]; cmdinfo = scsi_cmd_priv(cmnd); if (!(cmdinfo->state & IS_IN_WORK_LIST)) continue; err = uas_submit_urbs(cmnd, cmnd->device->hostdata); if (!err) cmdinfo->state &= ~IS_IN_WORK_LIST; else queue_work(workqueue, &devinfo->work); } out: spin_unlock_irqrestore(&devinfo->lock, flags); } static void uas_scan_work(struct work_struct *work) { struct uas_dev_info *devinfo = container_of(work, struct uas_dev_info, scan_work); struct Scsi_Host *shost = usb_get_intfdata(devinfo->intf); dev_dbg(&devinfo->intf->dev, "starting scan\n"); scsi_scan_host(shost); dev_dbg(&devinfo->intf->dev, "scan complete\n"); } static void uas_add_work(struct scsi_cmnd *cmnd) { struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct uas_dev_info *devinfo = cmnd->device->hostdata; lockdep_assert_held(&devinfo->lock); cmdinfo->state |= IS_IN_WORK_LIST; queue_work(workqueue, &devinfo->work); } static void uas_zap_pending(struct uas_dev_info *devinfo, int result) { struct uas_cmd_info *cmdinfo; struct scsi_cmnd *cmnd; unsigned long flags; int i, err; spin_lock_irqsave(&devinfo->lock, flags); for (i = 0; i < devinfo->qdepth; i++) { if (!devinfo->cmnd[i]) continue; cmnd = devinfo->cmnd[i]; cmdinfo = scsi_cmd_priv(cmnd); uas_log_cmd_state(cmnd, __func__, 0); /* Sense urbs were killed, clear COMMAND_INFLIGHT manually */ cmdinfo->state &= ~COMMAND_INFLIGHT; cmnd->result = result << 16; err = uas_try_complete(cmnd, __func__); WARN_ON(err != 0); } spin_unlock_irqrestore(&devinfo->lock, flags); } static void uas_sense(struct urb *urb, struct scsi_cmnd *cmnd) { struct sense_iu *sense_iu = urb->transfer_buffer; struct scsi_device *sdev = cmnd->device; if (urb->actual_length > 16) { unsigned len = be16_to_cpup(&sense_iu->len); if (len + 16 != urb->actual_length) { int newlen = min(len + 16, urb->actual_length) - 16; if (newlen < 0) newlen = 0; sdev_printk(KERN_INFO, sdev, "%s: urb length %d " "disagrees with IU sense data length %d, " "using %d bytes of sense data\n", __func__, urb->actual_length, len, newlen); len = newlen; } memcpy(cmnd->sense_buffer, sense_iu->sense, len); } cmnd->result = sense_iu->status; } static void uas_log_cmd_state(struct scsi_cmnd *cmnd, const char *prefix, int status) { struct uas_cmd_info *ci = scsi_cmd_priv(cmnd); if (status == -ENODEV) /* too late */ return; scmd_printk(KERN_INFO, cmnd, "%s %d uas-tag %d inflight:%s%s%s%s%s%s%s%s%s%s%s%s ", prefix, status, ci->uas_tag, (ci->state & SUBMIT_STATUS_URB) ? " s-st" : "", (ci->state & ALLOC_DATA_IN_URB) ? " a-in" : "", (ci->state & SUBMIT_DATA_IN_URB) ? " s-in" : "", (ci->state & ALLOC_DATA_OUT_URB) ? " a-out" : "", (ci->state & SUBMIT_DATA_OUT_URB) ? " s-out" : "", (ci->state & ALLOC_CMD_URB) ? " a-cmd" : "", (ci->state & SUBMIT_CMD_URB) ? " s-cmd" : "", (ci->state & COMMAND_INFLIGHT) ? " CMD" : "", (ci->state & DATA_IN_URB_INFLIGHT) ? " IN" : "", (ci->state & DATA_OUT_URB_INFLIGHT) ? " OUT" : "", (ci->state & COMMAND_ABORTED) ? " abort" : "", (ci->state & IS_IN_WORK_LIST) ? 
" work" : ""); scsi_print_command(cmnd); } static void uas_free_unsubmitted_urbs(struct scsi_cmnd *cmnd) { struct uas_cmd_info *cmdinfo; if (!cmnd) return; cmdinfo = scsi_cmd_priv(cmnd); if (cmdinfo->state & SUBMIT_CMD_URB) usb_free_urb(cmdinfo->cmd_urb); /* data urbs may have never gotten their submit flag set */ if (!(cmdinfo->state & DATA_IN_URB_INFLIGHT)) usb_free_urb(cmdinfo->data_in_urb); if (!(cmdinfo->state & DATA_OUT_URB_INFLIGHT)) usb_free_urb(cmdinfo->data_out_urb); } static int uas_try_complete(struct scsi_cmnd *cmnd, const char *caller) { struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct uas_dev_info *devinfo = (void *)cmnd->device->hostdata; lockdep_assert_held(&devinfo->lock); if (cmdinfo->state & (COMMAND_INFLIGHT | DATA_IN_URB_INFLIGHT | DATA_OUT_URB_INFLIGHT | COMMAND_ABORTED)) return -EBUSY; devinfo->cmnd[cmdinfo->uas_tag - 1] = NULL; uas_free_unsubmitted_urbs(cmnd); scsi_done(cmnd); return 0; } static void uas_xfer_data(struct urb *urb, struct scsi_cmnd *cmnd, unsigned direction) { struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); int err; cmdinfo->state |= direction | SUBMIT_STATUS_URB; err = uas_submit_urbs(cmnd, cmnd->device->hostdata); if (err) { uas_add_work(cmnd); } } static bool uas_evaluate_response_iu(struct response_iu *riu, struct scsi_cmnd *cmnd) { u8 response_code = riu->response_code; switch (response_code) { case RC_INCORRECT_LUN: set_host_byte(cmnd, DID_BAD_TARGET); break; case RC_TMF_SUCCEEDED: set_host_byte(cmnd, DID_OK); break; case RC_TMF_NOT_SUPPORTED: set_host_byte(cmnd, DID_BAD_TARGET); break; default: uas_log_cmd_state(cmnd, "response iu", response_code); set_host_byte(cmnd, DID_ERROR); break; } return response_code == RC_TMF_SUCCEEDED; } static void uas_stat_cmplt(struct urb *urb) { struct iu *iu = urb->transfer_buffer; struct Scsi_Host *shost = urb->context; struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; struct urb *data_in_urb = NULL; struct urb *data_out_urb = NULL; struct scsi_cmnd *cmnd; struct uas_cmd_info *cmdinfo; unsigned long flags; unsigned int idx; int status = urb->status; bool success; spin_lock_irqsave(&devinfo->lock, flags); if (devinfo->resetting) goto out; if (status) { if (status != -ENOENT && status != -ECONNRESET && status != -ESHUTDOWN) dev_err(&urb->dev->dev, "stat urb: status %d\n", status); goto out; } idx = be16_to_cpup(&iu->tag) - 1; if (idx >= MAX_CMNDS || !devinfo->cmnd[idx]) { dev_err(&urb->dev->dev, "stat urb: no pending cmd for uas-tag %d\n", idx + 1); goto out; } cmnd = devinfo->cmnd[idx]; cmdinfo = scsi_cmd_priv(cmnd); if (!(cmdinfo->state & COMMAND_INFLIGHT)) { uas_log_cmd_state(cmnd, "unexpected status cmplt", 0); goto out; } switch (iu->iu_id) { case IU_ID_STATUS: uas_sense(urb, cmnd); if (cmnd->result != 0) { /* cancel data transfers on error */ data_in_urb = usb_get_urb(cmdinfo->data_in_urb); data_out_urb = usb_get_urb(cmdinfo->data_out_urb); } cmdinfo->state &= ~COMMAND_INFLIGHT; uas_try_complete(cmnd, __func__); break; case IU_ID_READ_READY: if (!cmdinfo->data_in_urb || (cmdinfo->state & DATA_IN_URB_INFLIGHT)) { uas_log_cmd_state(cmnd, "unexpected read rdy", 0); break; } uas_xfer_data(urb, cmnd, SUBMIT_DATA_IN_URB); break; case IU_ID_WRITE_READY: if (!cmdinfo->data_out_urb || (cmdinfo->state & DATA_OUT_URB_INFLIGHT)) { uas_log_cmd_state(cmnd, "unexpected write rdy", 0); break; } uas_xfer_data(urb, cmnd, SUBMIT_DATA_OUT_URB); break; case IU_ID_RESPONSE: cmdinfo->state &= ~COMMAND_INFLIGHT; success = uas_evaluate_response_iu((struct response_iu *)iu, cmnd); if (!success) { 
/* Error, cancel data transfers */ data_in_urb = usb_get_urb(cmdinfo->data_in_urb); data_out_urb = usb_get_urb(cmdinfo->data_out_urb); } uas_try_complete(cmnd, __func__); break; default: uas_log_cmd_state(cmnd, "bogus IU", iu->iu_id); } out: usb_free_urb(urb); spin_unlock_irqrestore(&devinfo->lock, flags); /* Unlinking of data urbs must be done without holding the lock */ if (data_in_urb) { usb_unlink_urb(data_in_urb); usb_put_urb(data_in_urb); } if (data_out_urb) { usb_unlink_urb(data_out_urb); usb_put_urb(data_out_urb); } } static void uas_data_cmplt(struct urb *urb) { struct scsi_cmnd *cmnd = urb->context; struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct uas_dev_info *devinfo = (void *)cmnd->device->hostdata; struct scsi_data_buffer *sdb = &cmnd->sdb; unsigned long flags; int status = urb->status; spin_lock_irqsave(&devinfo->lock, flags); if (cmdinfo->data_in_urb == urb) { cmdinfo->state &= ~DATA_IN_URB_INFLIGHT; cmdinfo->data_in_urb = NULL; } else if (cmdinfo->data_out_urb == urb) { cmdinfo->state &= ~DATA_OUT_URB_INFLIGHT; cmdinfo->data_out_urb = NULL; } if (devinfo->resetting) goto out; /* Data urbs should not complete before the cmd urb is submitted */ if (cmdinfo->state & SUBMIT_CMD_URB) { uas_log_cmd_state(cmnd, "unexpected data cmplt", 0); goto out; } if (status) { if (status != -ENOENT && status != -ECONNRESET && status != -ESHUTDOWN) uas_log_cmd_state(cmnd, "data cmplt err", status); /* error: no data transfered */ scsi_set_resid(cmnd, sdb->length); set_host_byte(cmnd, DID_ERROR); } else { scsi_set_resid(cmnd, sdb->length - urb->actual_length); } uas_try_complete(cmnd, __func__); out: usb_free_urb(urb); spin_unlock_irqrestore(&devinfo->lock, flags); } static void uas_cmd_cmplt(struct urb *urb) { if (urb->status) dev_err(&urb->dev->dev, "cmd cmplt err %d\n", urb->status); usb_free_urb(urb); } static struct urb *uas_alloc_data_urb(struct uas_dev_info *devinfo, gfp_t gfp, struct scsi_cmnd *cmnd, enum dma_data_direction dir) { struct usb_device *udev = devinfo->udev; struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct urb *urb = usb_alloc_urb(0, gfp); struct scsi_data_buffer *sdb = &cmnd->sdb; unsigned int pipe = (dir == DMA_FROM_DEVICE) ? devinfo->data_in_pipe : devinfo->data_out_pipe; if (!urb) goto out; usb_fill_bulk_urb(urb, udev, pipe, NULL, sdb->length, uas_data_cmplt, cmnd); if (devinfo->use_streams) urb->stream_id = cmdinfo->uas_tag; urb->num_sgs = udev->bus->sg_tablesize ? 
sdb->table.nents : 0; urb->sg = sdb->table.sgl; out: return urb; } static struct urb *uas_alloc_sense_urb(struct uas_dev_info *devinfo, gfp_t gfp, struct scsi_cmnd *cmnd) { struct usb_device *udev = devinfo->udev; struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct urb *urb = usb_alloc_urb(0, gfp); struct sense_iu *iu; if (!urb) goto out; iu = kzalloc(sizeof(*iu), gfp); if (!iu) goto free; usb_fill_bulk_urb(urb, udev, devinfo->status_pipe, iu, sizeof(*iu), uas_stat_cmplt, cmnd->device->host); if (devinfo->use_streams) urb->stream_id = cmdinfo->uas_tag; urb->transfer_flags |= URB_FREE_BUFFER; out: return urb; free: usb_free_urb(urb); return NULL; } static struct urb *uas_alloc_cmd_urb(struct uas_dev_info *devinfo, gfp_t gfp, struct scsi_cmnd *cmnd) { struct usb_device *udev = devinfo->udev; struct scsi_device *sdev = cmnd->device; struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct urb *urb = usb_alloc_urb(0, gfp); struct command_iu *iu; int len; if (!urb) goto out; len = cmnd->cmd_len - 16; if (len < 0) len = 0; len = ALIGN(len, 4); iu = kzalloc(sizeof(*iu) + len, gfp); if (!iu) goto free; iu->iu_id = IU_ID_COMMAND; iu->tag = cpu_to_be16(cmdinfo->uas_tag); iu->prio_attr = UAS_SIMPLE_TAG; iu->len = len; int_to_scsilun(sdev->lun, &iu->lun); memcpy(iu->cdb, cmnd->cmnd, cmnd->cmd_len); usb_fill_bulk_urb(urb, udev, devinfo->cmd_pipe, iu, sizeof(*iu) + len, uas_cmd_cmplt, NULL); urb->transfer_flags |= URB_FREE_BUFFER; out: return urb; free: usb_free_urb(urb); return NULL; } /* * Why should I request the Status IU before sending the Command IU? Spec * says to, but also says the device may receive them in any order. Seems * daft to me. */ static int uas_submit_sense_urb(struct scsi_cmnd *cmnd, gfp_t gfp) { struct uas_dev_info *devinfo = cmnd->device->hostdata; struct urb *urb; int err; urb = uas_alloc_sense_urb(devinfo, gfp, cmnd); if (!urb) return -ENOMEM; usb_anchor_urb(urb, &devinfo->sense_urbs); err = usb_submit_urb(urb, gfp); if (err) { usb_unanchor_urb(urb); uas_log_cmd_state(cmnd, "sense submit err", err); usb_free_urb(urb); } return err; } static int uas_submit_urbs(struct scsi_cmnd *cmnd, struct uas_dev_info *devinfo) { struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); int err; lockdep_assert_held(&devinfo->lock); if (cmdinfo->state & SUBMIT_STATUS_URB) { err = uas_submit_sense_urb(cmnd, GFP_ATOMIC); if (err) return err; cmdinfo->state &= ~SUBMIT_STATUS_URB; } if (cmdinfo->state & ALLOC_DATA_IN_URB) { cmdinfo->data_in_urb = uas_alloc_data_urb(devinfo, GFP_ATOMIC, cmnd, DMA_FROM_DEVICE); if (!cmdinfo->data_in_urb) return -ENOMEM; cmdinfo->state &= ~ALLOC_DATA_IN_URB; } if (cmdinfo->state & SUBMIT_DATA_IN_URB) { usb_anchor_urb(cmdinfo->data_in_urb, &devinfo->data_urbs); err = usb_submit_urb(cmdinfo->data_in_urb, GFP_ATOMIC); if (err) { usb_unanchor_urb(cmdinfo->data_in_urb); uas_log_cmd_state(cmnd, "data in submit err", err); return err; } cmdinfo->state &= ~SUBMIT_DATA_IN_URB; cmdinfo->state |= DATA_IN_URB_INFLIGHT; } if (cmdinfo->state & ALLOC_DATA_OUT_URB) { cmdinfo->data_out_urb = uas_alloc_data_urb(devinfo, GFP_ATOMIC, cmnd, DMA_TO_DEVICE); if (!cmdinfo->data_out_urb) return -ENOMEM; cmdinfo->state &= ~ALLOC_DATA_OUT_URB; } if (cmdinfo->state & SUBMIT_DATA_OUT_URB) { usb_anchor_urb(cmdinfo->data_out_urb, &devinfo->data_urbs); err = usb_submit_urb(cmdinfo->data_out_urb, GFP_ATOMIC); if (err) { usb_unanchor_urb(cmdinfo->data_out_urb); uas_log_cmd_state(cmnd, "data out submit err", err); return err; } cmdinfo->state &= ~SUBMIT_DATA_OUT_URB; cmdinfo->state |= 
DATA_OUT_URB_INFLIGHT; } if (cmdinfo->state & ALLOC_CMD_URB) { cmdinfo->cmd_urb = uas_alloc_cmd_urb(devinfo, GFP_ATOMIC, cmnd); if (!cmdinfo->cmd_urb) return -ENOMEM; cmdinfo->state &= ~ALLOC_CMD_URB; } if (cmdinfo->state & SUBMIT_CMD_URB) { usb_anchor_urb(cmdinfo->cmd_urb, &devinfo->cmd_urbs); err = usb_submit_urb(cmdinfo->cmd_urb, GFP_ATOMIC); if (err) { usb_unanchor_urb(cmdinfo->cmd_urb); uas_log_cmd_state(cmnd, "cmd submit err", err); return err; } cmdinfo->cmd_urb = NULL; cmdinfo->state &= ~SUBMIT_CMD_URB; cmdinfo->state |= COMMAND_INFLIGHT; } return 0; } static int uas_queuecommand_lck(struct scsi_cmnd *cmnd) { struct scsi_device *sdev = cmnd->device; struct uas_dev_info *devinfo = sdev->hostdata; struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); unsigned long flags; int idx, err; /* Re-check scsi_block_requests now that we've the host-lock */ if (cmnd->device->host->host_self_blocked) return SCSI_MLQUEUE_DEVICE_BUSY; if ((devinfo->flags & US_FL_NO_ATA_1X) && (cmnd->cmnd[0] == ATA_12 || cmnd->cmnd[0] == ATA_16)) { memcpy(cmnd->sense_buffer, usb_stor_sense_invalidCDB, sizeof(usb_stor_sense_invalidCDB)); cmnd->result = SAM_STAT_CHECK_CONDITION; scsi_done(cmnd); return 0; } spin_lock_irqsave(&devinfo->lock, flags); if (devinfo->resetting) { set_host_byte(cmnd, DID_ERROR); scsi_done(cmnd); goto zombie; } /* Find a free uas-tag */ for (idx = 0; idx < devinfo->qdepth; idx++) { if (!devinfo->cmnd[idx]) break; } if (idx == devinfo->qdepth) { spin_unlock_irqrestore(&devinfo->lock, flags); return SCSI_MLQUEUE_DEVICE_BUSY; } memset(cmdinfo, 0, sizeof(*cmdinfo)); cmdinfo->uas_tag = idx + 1; /* uas-tag == usb-stream-id, so 1 based */ cmdinfo->state = SUBMIT_STATUS_URB | ALLOC_CMD_URB | SUBMIT_CMD_URB; switch (cmnd->sc_data_direction) { case DMA_FROM_DEVICE: cmdinfo->state |= ALLOC_DATA_IN_URB | SUBMIT_DATA_IN_URB; break; case DMA_BIDIRECTIONAL: cmdinfo->state |= ALLOC_DATA_IN_URB | SUBMIT_DATA_IN_URB; fallthrough; case DMA_TO_DEVICE: cmdinfo->state |= ALLOC_DATA_OUT_URB | SUBMIT_DATA_OUT_URB; break; case DMA_NONE: break; } if (!devinfo->use_streams) cmdinfo->state &= ~(SUBMIT_DATA_IN_URB | SUBMIT_DATA_OUT_URB); err = uas_submit_urbs(cmnd, devinfo); /* * in case of fatal errors the SCSI layer is peculiar * a command that has finished is a success for the purpose * of queueing, no matter how fatal the error */ if (err == -ENODEV) { set_host_byte(cmnd, DID_NO_CONNECT); scsi_done(cmnd); goto zombie; } if (err) { /* If we did nothing, give up now */ if (cmdinfo->state & SUBMIT_STATUS_URB) { spin_unlock_irqrestore(&devinfo->lock, flags); return SCSI_MLQUEUE_DEVICE_BUSY; } uas_add_work(cmnd); } devinfo->cmnd[idx] = cmnd; zombie: spin_unlock_irqrestore(&devinfo->lock, flags); return 0; } static DEF_SCSI_QCMD(uas_queuecommand) /* * For now we do not support actually sending an abort to the device, so * this eh always fails. Still we must define it to make sure that we've * dropped all references to the cmnd in question once this function exits. 
*/ static int uas_eh_abort_handler(struct scsi_cmnd *cmnd) { struct uas_cmd_info *cmdinfo = scsi_cmd_priv(cmnd); struct uas_dev_info *devinfo = (void *)cmnd->device->hostdata; struct urb *data_in_urb = NULL; struct urb *data_out_urb = NULL; unsigned long flags; spin_lock_irqsave(&devinfo->lock, flags); uas_log_cmd_state(cmnd, __func__, 0); /* Ensure that try_complete does not call scsi_done */ cmdinfo->state |= COMMAND_ABORTED; /* Drop all refs to this cmnd, kill data urbs to break their ref */ devinfo->cmnd[cmdinfo->uas_tag - 1] = NULL; if (cmdinfo->state & DATA_IN_URB_INFLIGHT) data_in_urb = usb_get_urb(cmdinfo->data_in_urb); if (cmdinfo->state & DATA_OUT_URB_INFLIGHT) data_out_urb = usb_get_urb(cmdinfo->data_out_urb); uas_free_unsubmitted_urbs(cmnd); spin_unlock_irqrestore(&devinfo->lock, flags); if (data_in_urb) { usb_kill_urb(data_in_urb); usb_put_urb(data_in_urb); } if (data_out_urb) { usb_kill_urb(data_out_urb); usb_put_urb(data_out_urb); } return FAILED; } static int uas_eh_device_reset_handler(struct scsi_cmnd *cmnd) { struct scsi_device *sdev = cmnd->device; struct uas_dev_info *devinfo = sdev->hostdata; struct usb_device *udev = devinfo->udev; unsigned long flags; int err; err = usb_lock_device_for_reset(udev, devinfo->intf); if (err) { shost_printk(KERN_ERR, sdev->host, "%s FAILED to get lock err %d\n", __func__, err); return FAILED; } shost_printk(KERN_INFO, sdev->host, "%s start\n", __func__); spin_lock_irqsave(&devinfo->lock, flags); devinfo->resetting = 1; spin_unlock_irqrestore(&devinfo->lock, flags); usb_kill_anchored_urbs(&devinfo->cmd_urbs); usb_kill_anchored_urbs(&devinfo->sense_urbs); usb_kill_anchored_urbs(&devinfo->data_urbs); uas_zap_pending(devinfo, DID_RESET); err = usb_reset_device(udev); spin_lock_irqsave(&devinfo->lock, flags); devinfo->resetting = 0; spin_unlock_irqrestore(&devinfo->lock, flags); usb_unlock_device(udev); if (err) { shost_printk(KERN_INFO, sdev->host, "%s FAILED err %d\n", __func__, err); return FAILED; } shost_printk(KERN_INFO, sdev->host, "%s success\n", __func__); return SUCCESS; } static int uas_target_alloc(struct scsi_target *starget) { struct uas_dev_info *devinfo = (struct uas_dev_info *) dev_to_shost(starget->dev.parent)->hostdata; if (devinfo->flags & US_FL_NO_REPORT_LUNS) starget->no_report_luns = 1; return 0; } static int uas_sdev_init(struct scsi_device *sdev) { struct uas_dev_info *devinfo = (struct uas_dev_info *)sdev->host->hostdata; /* * Some USB storage devices reset if the IO advice hints grouping mode * page is queried. Hence skip that mode page. 
*/ sdev->sdev_bflags |= BLIST_SKIP_IO_HINTS; sdev->hostdata = devinfo; return 0; } static int uas_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) { struct uas_dev_info *devinfo = sdev->hostdata; if (devinfo->flags & US_FL_MAX_SECTORS_64) lim->max_hw_sectors = 64; else if (devinfo->flags & US_FL_MAX_SECTORS_240) lim->max_hw_sectors = 240; if (devinfo->flags & US_FL_NO_REPORT_OPCODES) sdev->no_report_opcodes = 1; /* A few buggy USB-ATA bridges don't understand FUA */ if (devinfo->flags & US_FL_BROKEN_FUA) sdev->broken_fua = 1; /* UAS also needs to support FL_ALWAYS_SYNC */ if (devinfo->flags & US_FL_ALWAYS_SYNC) { sdev->skip_ms_page_3f = 1; sdev->skip_ms_page_8 = 1; sdev->wce_default_on = 1; } /* Some disks cannot handle READ_CAPACITY_16 */ if (devinfo->flags & US_FL_NO_READ_CAPACITY_16) sdev->no_read_capacity_16 = 1; /* Some disks cannot handle WRITE_SAME */ if (devinfo->flags & US_FL_NO_SAME) sdev->no_write_same = 1; /* * Some disks return the total number of blocks in response * to READ CAPACITY rather than the highest block number. * If this device makes that mistake, tell the sd driver. */ if (devinfo->flags & US_FL_FIX_CAPACITY) sdev->fix_capacity = 1; /* * in some cases we have to guess */ if (devinfo->flags & US_FL_CAPACITY_HEURISTICS) sdev->guess_capacity = 1; /* * Some devices report generic values until the media has been * accessed. Force a READ(10) prior to querying device * characteristics. */ sdev->read_before_ms = 1; /* * Some devices don't like MODE SENSE with page=0x3f, * which is the command used for checking if a device * is write-protected. Now that we tell the sd driver * to do a 192-byte transfer with this command the * majority of devices work fine, but a few still can't * handle it. The sd driver will simply assume those * devices are write-enabled. */ if (devinfo->flags & US_FL_NO_WP_DETECT) sdev->skip_ms_page_3f = 1; scsi_change_queue_depth(sdev, devinfo->qdepth - 2); return 0; } static const struct scsi_host_template uas_host_template = { .module = THIS_MODULE, .name = "uas", .queuecommand = uas_queuecommand, .target_alloc = uas_target_alloc, .sdev_init = uas_sdev_init, .sdev_configure = uas_sdev_configure, .eh_abort_handler = uas_eh_abort_handler, .eh_device_reset_handler = uas_eh_device_reset_handler, .this_id = -1, .skip_settle_delay = 1, /* * The protocol has no requirements on alignment in the strict sense. * Controllers may or may not have alignment restrictions. * As this is not exported, we use an extremely conservative guess. 
*/ .dma_alignment = 511, .dma_boundary = PAGE_SIZE - 1, .cmd_size = sizeof(struct uas_cmd_info), }; #define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \ vendorName, productName, useProtocol, useTransport, \ initFunction, flags) \ { USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \ .driver_info = (flags) } static const struct usb_device_id uas_usb_ids[] = { # include "unusual_uas.h" { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_BULK) }, { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_UAS) }, { } }; MODULE_DEVICE_TABLE(usb, uas_usb_ids); #undef UNUSUAL_DEV static int uas_switch_interface(struct usb_device *udev, struct usb_interface *intf) { struct usb_host_interface *alt; alt = uas_find_uas_alt_setting(intf); if (!alt) return -ENODEV; return usb_set_interface(udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); } static int uas_configure_endpoints(struct uas_dev_info *devinfo) { struct usb_host_endpoint *eps[4] = { }; struct usb_device *udev = devinfo->udev; int r; r = uas_find_endpoints(devinfo->intf->cur_altsetting, eps); if (r) return r; devinfo->cmd_pipe = usb_sndbulkpipe(udev, usb_endpoint_num(&eps[0]->desc)); devinfo->status_pipe = usb_rcvbulkpipe(udev, usb_endpoint_num(&eps[1]->desc)); devinfo->data_in_pipe = usb_rcvbulkpipe(udev, usb_endpoint_num(&eps[2]->desc)); devinfo->data_out_pipe = usb_sndbulkpipe(udev, usb_endpoint_num(&eps[3]->desc)); if (udev->speed < USB_SPEED_SUPER) { devinfo->qdepth = 32; devinfo->use_streams = 0; } else { devinfo->qdepth = usb_alloc_streams(devinfo->intf, eps + 1, 3, MAX_CMNDS, GFP_NOIO); if (devinfo->qdepth < 0) return devinfo->qdepth; devinfo->use_streams = 1; } return 0; } static void uas_free_streams(struct uas_dev_info *devinfo) { struct usb_device *udev = devinfo->udev; struct usb_host_endpoint *eps[3]; eps[0] = usb_pipe_endpoint(udev, devinfo->status_pipe); eps[1] = usb_pipe_endpoint(udev, devinfo->data_in_pipe); eps[2] = usb_pipe_endpoint(udev, devinfo->data_out_pipe); usb_free_streams(devinfo->intf, eps, 3, GFP_NOIO); } static int uas_probe(struct usb_interface *intf, const struct usb_device_id *id) { int result = -ENOMEM; struct Scsi_Host *shost = NULL; struct uas_dev_info *devinfo; struct usb_device *udev = interface_to_usbdev(intf); u64 dev_flags; if (!uas_use_uas_driver(intf, id, &dev_flags)) return -ENODEV; if (uas_switch_interface(udev, intf)) return -ENODEV; shost = scsi_host_alloc(&uas_host_template, sizeof(struct uas_dev_info)); if (!shost) goto set_alt0; shost->max_cmd_len = 16 + 252; shost->max_id = 1; shost->max_lun = 256; shost->max_channel = 0; shost->sg_tablesize = udev->bus->sg_tablesize; devinfo = (struct uas_dev_info *)shost->hostdata; devinfo->intf = intf; devinfo->udev = udev; devinfo->resetting = 0; devinfo->shutdown = 0; devinfo->flags = dev_flags; init_usb_anchor(&devinfo->cmd_urbs); init_usb_anchor(&devinfo->sense_urbs); init_usb_anchor(&devinfo->data_urbs); spin_lock_init(&devinfo->lock); INIT_WORK(&devinfo->work, uas_do_work); INIT_WORK(&devinfo->scan_work, uas_scan_work); result = uas_configure_endpoints(devinfo); if (result) goto set_alt0; /* * 1 tag is reserved for untagged commands + * 1 tag to avoid off by one errors in some bridge firmwares */ shost->can_queue = devinfo->qdepth - 2; usb_set_intfdata(intf, shost); result = scsi_add_host(shost, &intf->dev); if (result) goto free_streams; /* Submit the delayed_work for SCSI-device scanning */ schedule_work(&devinfo->scan_work); return result; free_streams: 
uas_free_streams(devinfo); usb_set_intfdata(intf, NULL); set_alt0: usb_set_interface(udev, intf->altsetting[0].desc.bInterfaceNumber, 0); if (shost) scsi_host_put(shost); return result; } static int uas_cmnd_list_empty(struct uas_dev_info *devinfo) { unsigned long flags; int i, r = 1; spin_lock_irqsave(&devinfo->lock, flags); for (i = 0; i < devinfo->qdepth; i++) { if (devinfo->cmnd[i]) { r = 0; /* Not empty */ break; } } spin_unlock_irqrestore(&devinfo->lock, flags); return r; } /* * Wait for any pending cmnds to complete, on usb-2 sense_urbs may temporarily * get empty while there still is more work to do due to sense-urbs completing * with a READ/WRITE_READY iu code, so keep waiting until the list gets empty. */ static int uas_wait_for_pending_cmnds(struct uas_dev_info *devinfo) { unsigned long start_time; int r; start_time = jiffies; do { flush_work(&devinfo->work); r = usb_wait_anchor_empty_timeout(&devinfo->sense_urbs, 5000); if (r == 0) return -ETIME; r = usb_wait_anchor_empty_timeout(&devinfo->data_urbs, 500); if (r == 0) return -ETIME; if (time_after(jiffies, start_time + 5 * HZ)) return -ETIME; } while (!uas_cmnd_list_empty(devinfo)); return 0; } static int uas_pre_reset(struct usb_interface *intf) { struct Scsi_Host *shost = usb_get_intfdata(intf); struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; unsigned long flags; if (devinfo->shutdown) return 0; /* Block new requests */ spin_lock_irqsave(shost->host_lock, flags); scsi_block_requests(shost); spin_unlock_irqrestore(shost->host_lock, flags); if (uas_wait_for_pending_cmnds(devinfo) != 0) { shost_printk(KERN_ERR, shost, "%s: timed out\n", __func__); scsi_unblock_requests(shost); return 1; } uas_free_streams(devinfo); return 0; } static int uas_post_reset(struct usb_interface *intf) { struct Scsi_Host *shost = usb_get_intfdata(intf); struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; unsigned long flags; int err; if (devinfo->shutdown) return 0; err = uas_configure_endpoints(devinfo); if (err && err != -ENODEV) shost_printk(KERN_ERR, shost, "%s: alloc streams error %d after reset", __func__, err); /* we must unblock the host in every case lest we deadlock */ spin_lock_irqsave(shost->host_lock, flags); scsi_report_bus_reset(shost, 0); spin_unlock_irqrestore(shost->host_lock, flags); scsi_unblock_requests(shost); return err ? 
1 : 0; } static int uas_suspend(struct usb_interface *intf, pm_message_t message) { struct Scsi_Host *shost = usb_get_intfdata(intf); struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; if (uas_wait_for_pending_cmnds(devinfo) != 0) { shost_printk(KERN_ERR, shost, "%s: timed out\n", __func__); return -ETIME; } return 0; } static int uas_resume(struct usb_interface *intf) { return 0; } static int uas_reset_resume(struct usb_interface *intf) { struct Scsi_Host *shost = usb_get_intfdata(intf); struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; unsigned long flags; int err; err = uas_configure_endpoints(devinfo); if (err) { shost_printk(KERN_ERR, shost, "%s: alloc streams error %d after reset", __func__, err); return -EIO; } spin_lock_irqsave(shost->host_lock, flags); scsi_report_bus_reset(shost, 0); spin_unlock_irqrestore(shost->host_lock, flags); return 0; } static void uas_disconnect(struct usb_interface *intf) { struct Scsi_Host *shost = usb_get_intfdata(intf); struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; unsigned long flags; spin_lock_irqsave(&devinfo->lock, flags); devinfo->resetting = 1; spin_unlock_irqrestore(&devinfo->lock, flags); cancel_work_sync(&devinfo->work); usb_kill_anchored_urbs(&devinfo->cmd_urbs); usb_kill_anchored_urbs(&devinfo->sense_urbs); usb_kill_anchored_urbs(&devinfo->data_urbs); uas_zap_pending(devinfo, DID_NO_CONNECT); /* * Prevent SCSI scanning (if it hasn't started yet) * or wait for the SCSI-scanning routine to stop. */ cancel_work_sync(&devinfo->scan_work); scsi_remove_host(shost); uas_free_streams(devinfo); scsi_host_put(shost); } /* * Put the device back in usb-storage mode on shutdown, as some BIOS-es * hang on reboot when the device is still in uas mode. Note the reset is * necessary as some devices won't revert to usb-storage mode without it. */ static void uas_shutdown(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct Scsi_Host *shost = usb_get_intfdata(intf); struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata; if (system_state != SYSTEM_RESTART) return; devinfo->shutdown = 1; uas_free_streams(devinfo); usb_set_interface(udev, intf->altsetting[0].desc.bInterfaceNumber, 0); usb_reset_device(udev); } static struct usb_driver uas_driver = { .name = "uas", .probe = uas_probe, .disconnect = uas_disconnect, .pre_reset = uas_pre_reset, .post_reset = uas_post_reset, .suspend = uas_suspend, .resume = uas_resume, .reset_resume = uas_reset_resume, .shutdown = uas_shutdown, .id_table = uas_usb_ids, }; static int __init uas_init(void) { int rv; workqueue = alloc_workqueue("uas", WQ_MEM_RECLAIM, 0); if (!workqueue) return -ENOMEM; rv = usb_register(&uas_driver); if (rv) { destroy_workqueue(workqueue); return -ENOMEM; } return 0; } static void __exit uas_exit(void) { usb_deregister(&uas_driver); destroy_workqueue(workqueue); } module_init(uas_init); module_exit(uas_exit); MODULE_DESCRIPTION("USB Attached SCSI driver"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("USB_STORAGE"); MODULE_AUTHOR( "Hans de Goede <hdegoede@redhat.com>, Matthew Wilcox and Sarah Sharp");
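/*
 * Illustrative sketch (not part of uas.c): the queuecommand path above picks
 * a free 1-based "uas tag" by scanning devinfo->cmnd[] under devinfo->lock.
 * The standalone, hypothetical helper below models that idea with a plain
 * array and a pthread mutex so the allocation rule (tag == index + 1; tag 0
 * is never handed out because stream id 0 is reserved) can be exercised in
 * userspace. Names such as alloc_tag() and QDEPTH are made up for the sketch.
 */
#include <pthread.h>
#include <stdio.h>

#define QDEPTH 32

static void *slots[QDEPTH];                 /* models devinfo->cmnd[] */
static pthread_mutex_t slots_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns a 1-based tag, or 0 if every slot is busy (queue full). */
static unsigned int alloc_tag(void *owner)
{
	unsigned int tag = 0;
	int idx;

	pthread_mutex_lock(&slots_lock);
	for (idx = 0; idx < QDEPTH; idx++) {
		if (!slots[idx]) {
			slots[idx] = owner;
			tag = idx + 1;      /* tag == slot index + 1, so 1 based */
			break;
		}
	}
	pthread_mutex_unlock(&slots_lock);
	return tag;
}

static void free_tag(unsigned int tag)
{
	pthread_mutex_lock(&slots_lock);
	if (tag && tag <= QDEPTH)
		slots[tag - 1] = NULL;
	pthread_mutex_unlock(&slots_lock);
}

int main(void)
{
	int dummy;
	unsigned int tag = alloc_tag(&dummy);

	printf("allocated tag %u\n", tag);      /* prints 1 on first use */
	free_tag(tag);
	return 0;
}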
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 input * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Ian P. Morris <I.P.Morris@soton.ac.uk> * * Based in linux/net/ipv4/ip_input.c */ /* Changes * * Mitsuru KANDA @USAGI and * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). 
*/ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/slab.h> #include <linux/indirect_call_wrapper.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/udp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/inet_ecn.h> #include <net/dst_metadata.h> static void ip6_rcv_finish_core(struct net *net, struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk) { switch (ipv6_hdr(skb)->nexthdr) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) tcp_v6_early_demux(skb); break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) udp_v6_early_demux(skb); break; } } if (!skb_valid_dst(skb)) ip6_route_input(skb); } int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_rcv(skb); if (!skb) return NET_RX_SUCCESS; ip6_rcv_finish_core(net, sk, skb); return dst_input(skb); } static void ip6_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static bool ip6_can_use_hint(const struct sk_buff *skb, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr); } static struct sk_buff *ip6_extract_route_hint(const struct net *net, struct sk_buff *skb) { if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) || IP6CB(skb)->flags & IP6SKB_MULTIPATH) return NULL; return skb; } static void ip6_list_rcv_finish(struct net *net, struct sock *sk, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_rcv(skb); if (!skb) continue; if (ip6_can_use_hint(skb, hint)) skb_dst_copy(skb, hint); else ip6_rcv_finish_core(net, sk, skb); dst = skb_dst(skb); if (curr_dst != dst) { hint = ip6_extract_route_hint(net, skb); /* dispatch old sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip6_sublist_rcv_finish(&sublist); } static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, struct net *net) { enum skb_drop_reason reason; const struct ipv6hdr *hdr; u32 pkt_len; struct inet6_dev *idev; if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_OTHERHOST); return NULL; } rcu_read_lock(); idev = __in6_dev_get(skb->dev); __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len); SKB_DR_SET(reason, NOT_SPECIFIED); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) { __IP6_INC_STATS(net, idev, 
IPSTATS_MIB_INDISCARDS); if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6))) SKB_DR_SET(reason, IPV6DISABLED); goto drop; } memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); /* * Store incoming device index. When the packet will * be queued, we cannot refer to skb->dev anymore. * * BTW, when we send a packet for our own local address on a * non-loopback interface (e.g. ethX), it is being delivered * via the loopback interface (lo) here; skb->dev = loopback_dev. * It, however, should be considered as if it is being * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; hdr = ipv6_hdr(skb); if (hdr->version != 6) { SKB_DR_SET(reason, UNHANDLED_PROTO); goto err; } __IP6_ADD_STATS(net, idev, IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * The loopback address must not be used as the source address in IPv6 * packets that are sent outside of a single node. [..] * A packet received on an interface with a destination address * of loopback must be dropped. */ if ((ipv6_addr_loopback(&hdr->saddr) || ipv6_addr_loopback(&hdr->daddr)) && !(dev->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev)) goto err; /* RFC4291 Errata ID: 3480 * Interface-Local scope spans only a single interface on a * node and is useful only for loopback transmission of * multicast. Packets with interface-local scope received * from another node must be discarded. */ if (!(skb->pkt_type == PACKET_LOOPBACK || dev->flags & IFF_LOOPBACK) && ipv6_addr_is_multicast(&hdr->daddr) && IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1) goto err; /* If enabled, drop unicast packets that were encapsulated in link-layer * multicast or broadcast to protected against the so-called "hole-196" * attack in 802.11 wireless. */ if (!ipv6_addr_is_multicast(&hdr->daddr) && (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) && READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) { SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST); goto err; } /* RFC4291 2.7 * Nodes must not originate a packet to a multicast address whose scope * field contains the reserved value 0; if such a packet is received, it * must be silently dropped. */ if (ipv6_addr_is_multicast(&hdr->daddr) && IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 0) goto err; /* * RFC4291 2.7 * Multicast addresses must not be used as source addresses in IPv6 * packets or appear in any Routing header. */ if (ipv6_addr_is_multicast(&hdr->saddr)) goto err; skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); pkt_len = ntohs(hdr->payload_len); /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); SKB_DR_SET(reason, PKT_TOO_SMALL); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto err; hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP) { if (ipv6_parse_hopopts(skb) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); return NULL; } } rcu_read_unlock(); /* Must drop socket now because of tproxy. 
*/ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; err: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); SKB_DR_OR(reason, IP_INHDR); drop: rcu_read_unlock(); kfree_skb_reason(skb, reason); return NULL; } int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(skb->dev); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip6_rcv_finish); } static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip6_rcv_finish); ip6_list_rcv_finish(net, NULL, head); } /* Receive a list of IPv6 packets */ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip6_rcv_core(skb, dev, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip6_sublist_rcv(&sublist, curr_dev, curr_net); } INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); /* * Deliver the packet to the host */ void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final) { const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; SKB_DR(reason); bool raw; /* * Parse extension headers */ resubmit: idev = ip6_dst_idev(skb_dst(skb)); nhoff = IP6CB(skb)->nhoff; if (!have_final) { if (!pskb_pull(skb, skb_transport_offset(skb))) goto discard; nexthdr = skb_network_header(skb)[nhoff]; } resubmit_final: raw = raw6_local_deliver(skb, nexthdr); ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot) { int ret; if (have_final) { if (!(ipprot->flags & INET6_PROTO_FINAL)) { /* Once we've seen a final protocol don't * allow encapsulation on any non-final * ones. This allows foo in UDP encapsulation * to work. */ goto discard; } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; int sdif = inet6_sdif(skb); struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); /* skb->dev passed may be master dev for vrfs. */ if (sdif) { dev = dev_get_by_index_rcu(net, sdif); if (!dev) goto discard; } else { dev = skb->dev; } if (ipv6_addr_is_multicast(&hdr->daddr) && !ipv6_chk_mcast_addr(dev, &hdr->daddr, &hdr->saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) { SKB_DR_SET(reason, IP_INADDRERRORS); goto discard; } } if (!(ipprot->flags & INET6_PROTO_NOPOLICY)) { if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { SKB_DR_SET(reason, XFRM_POLICY); goto discard; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv, skb); if (ret > 0) { if (ipprot->flags & INET6_PROTO_FINAL) { /* Not an extension header, most likely UDP * encapsulation. 
Use return value as nexthdr * protocol not nhoff (which presumably is * not set by handler). */ nexthdr = ret; goto resubmit_final; } else { goto resubmit; } } else if (ret == 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); } } else { if (!raw) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, nhoff); SKB_DR_SET(reason, IP_NOPROTO); } else { SKB_DR_SET(reason, XFRM_POLICY); } kfree_skb_reason(skb, reason); } else { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } return; discard: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); kfree_skb_reason(skb, reason); } static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); rcu_read_unlock(); return 0; } int ip6_input(struct sk_buff *skb) { return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, dev_net(skb->dev), NULL, skb, skb->dev, NULL, ip6_input_finish); } EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); /* skb->dev passed may be master dev for vrfs. */ if (sdif) { rcu_read_lock(); dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); if (!dev) { rcu_read_unlock(); kfree_skb(skb); return -ENODEV; } } else { dev = skb->dev; } hdr = ipv6_hdr(skb); deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); if (sdif) rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* * IPv6 multicast router mode is now supported ;) */ if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { /* * Okay, we try to forward - split and duplicate * packets. */ struct sk_buff *skb2; struct inet6_skb_parm *opt = IP6CB(skb); /* Check for MLD */ if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { /* Check if this is a mld message */ u8 nexthdr = hdr->nexthdr; __be16 frag_off; int offset; /* Check if the value of Router Alert * is for MLD (0x0000). */ if (opt->ra == htons(IPV6_OPT_ROUTERALERT_MLD)) { deliver = false; if (!ipv6_ext_hdr(nexthdr)) { /* BUG */ goto out; } offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); if (offset < 0) goto out; if (ipv6_is_mld(skb, nexthdr, offset)) deliver = true; goto out; } /* unknown RA - process it normally */ } if (deliver) skb2 = skb_clone(skb, GFP_ATOMIC); else { skb2 = skb; skb = NULL; } if (skb2) { ip6_mr_input(skb2); } } out: #endif if (likely(deliver)) ip6_input(skb); else { /* discard */ kfree_skb(skb); } return 0; }
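/*
 * Illustrative sketch (not kernel code): ipv6_list_rcv() above batches
 * consecutive skbs that share the same (dev, net) pair into a sublist and
 * dispatches each sublist in one go. The hypothetical program below applies
 * the same "group consecutive items by key, flush on key change, flush the
 * tail" pattern to an array of integers; dispatch() stands in for
 * ip6_sublist_rcv().
 */
#include <stdio.h>

static void dispatch(const int *batch, int n)
{
	/* a real dispatcher would hand the whole batch to the next stage */
	printf("dispatching %d item(s) with key %d\n", n, batch[0]);
}

int main(void)
{
	/* keys stand in for the per-skb (dev, net) pair */
	int keys[] = { 1, 1, 2, 2, 2, 3 };
	int nitems = sizeof(keys) / sizeof(keys[0]);
	int start = 0;
	int i;

	for (i = 1; i < nitems; i++) {
		if (keys[i] != keys[start]) {
			dispatch(&keys[start], i - start);   /* flush old sublist */
			start = i;                           /* start new sublist */
		}
	}
	dispatch(&keys[start], nitems - start);              /* flush final sublist */
	return 0;
}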
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* * linux/can/skb.h * * Definitions for the CAN network socket buffer * * Copyright (C) 2012 Oliver Hartkopp <socketcan@hartkopp.net> * */ #ifndef _CAN_SKB_H #define _CAN_SKB_H #include <linux/types.h> #include <linux/skbuff.h> #include <linux/can.h> #include <net/sock.h> void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr); unsigned int __must_check can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); /* * The struct can_skb_priv is used to transport additional information along * with the stored struct can(fd)_frame that can not be contained in existing * struct sk_buff elements. * N.B. that this information must not be modified in cloned CAN sk_buffs. * To modify the CAN frame content or the struct can_skb_priv content * skb_copy() needs to be used instead of skb_clone(). */ /** * struct can_skb_priv - private additional data inside CAN sk_buffs * @ifindex: ifindex of the first interface the CAN frame appeared on * @skbcnt: atomic counter to have an unique id together with skb pointer * @frame_len: length of CAN frame in data link layer * @cf: align to the following CAN frame at skb->data */ struct can_skb_priv { int ifindex; int skbcnt; unsigned int frame_len; struct can_frame cf[]; }; static inline struct can_skb_priv *can_skb_prv(struct sk_buff *skb) { return (struct can_skb_priv *)(skb->head); } static inline void can_skb_reserve(struct sk_buff *skb) { skb_reserve(skb, sizeof(struct can_skb_priv)); } static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) { /* If the socket has already been closed by user space, the * refcount may already be 0 (and the socket will be freed * after the last TX skb has been freed). So only increase * socket refcount if the refcount is > 0. 
*/ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb->destructor = sock_efree; skb->sk = sk; } } /* * returns an unshared skb owned by the original sock to be echo'ed back */ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) { struct sk_buff *nskb; nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { kfree_skb(skb); return NULL; } can_skb_set_owner(nskb, skb->sk); consume_skb(skb); return nskb; } static inline bool can_is_can_skb(const struct sk_buff *skb) { struct can_frame *cf = (struct can_frame *)skb->data; /* the CAN specific type of skb is identified by its data length */ return (skb->len == CAN_MTU && cf->len <= CAN_MAX_DLEN); } static inline bool can_is_canfd_skb(const struct sk_buff *skb) { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; /* the CAN specific type of skb is identified by its data length */ return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } static inline bool can_is_canxl_skb(const struct sk_buff *skb) { const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU) return false; /* this also checks valid CAN XL data length boundaries */ if (skb->len != CANXL_HDR_SIZE + cxl->len) return false; return cxl->flags & CANXL_XLF; } /* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; if (can_is_canxl_skb(skb)) return cxl->len; return cfd->len; } /* get needed data length inside CAN frame for all frame types (RTR aware) */ static inline unsigned int can_skb_get_data_len(struct sk_buff *skb) { unsigned int len = can_skb_get_len_val(skb); const struct can_frame *cf = (struct can_frame *)skb->data; /* RTR frames have an actual length of zero */ if (can_is_can_skb(skb) && cf->can_id & CAN_RTR_FLAG) return 0; return len; } #endif /* !_CAN_SKB_H */
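/*
 * Illustrative sketch (userspace, not part of this header): can_is_can_skb()
 * and friends above distinguish classic CAN, CAN FD and CAN XL purely by the
 * length of the data at skb->data. The hypothetical helper below applies the
 * same length-based rule to a raw buffer length, using only the UAPI sizes
 * from <linux/can.h>; CAN XL is left out to keep the example portable to
 * older headers.
 */
#include <linux/can.h>
#include <stddef.h>
#include <stdio.h>

static const char *classify_by_len(size_t len)
{
	if (len == CAN_MTU)		/* sizeof(struct can_frame) */
		return "classic CAN";
	if (len == CANFD_MTU)		/* sizeof(struct canfd_frame) */
		return "CAN FD";
	return "unknown / not a CAN frame";
}

int main(void)
{
	printf("%zu bytes -> %s\n", sizeof(struct can_frame),
	       classify_by_len(sizeof(struct can_frame)));
	printf("%zu bytes -> %s\n", sizeof(struct canfd_frame),
	       classify_by_len(sizeof(struct canfd_frame)));
	printf("%d bytes -> %s\n", 5, classify_by_len(5));
	return 0;
}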
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (C) 2019 Netronome Systems, Inc. */ #ifndef __NET_TC_MPLS_H #define __NET_TC_MPLS_H #include <linux/tc_act/tc_mpls.h> #include <net/act_api.h> struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; __be16 tcfm_proto; struct rcu_head rcu; }; #define ACT_MPLS_TC_NOT_SET 0xff #define ACT_MPLS_BOS_NOT_SET 0xff #define ACT_MPLS_LABEL_NOT_SET 0xffffffff struct tcf_mpls { struct tc_action common; struct tcf_mpls_params __rcu *mpls_p; }; #define to_mpls(a) ((struct tcf_mpls *)a) static inline bool is_tcf_mpls(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_MPLS) return true; #endif return false; } static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; rcu_read_lock(); tcfm_action = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_action; rcu_read_unlock(); return tcfm_action; } static inline __be16 tcf_mpls_proto(const struct tc_action *a) { __be16 tcfm_proto; rcu_read_lock(); tcfm_proto = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_proto; rcu_read_unlock(); return tcfm_proto; } static inline u32 tcf_mpls_label(const struct tc_action *a) { u32 tcfm_label; rcu_read_lock(); tcfm_label = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_label; rcu_read_unlock(); return tcfm_label; } static inline u8 tcf_mpls_tc(const struct tc_action *a) { u8 tcfm_tc; rcu_read_lock(); tcfm_tc = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_tc; rcu_read_unlock(); return tcfm_tc; } static inline u8 tcf_mpls_bos(const struct tc_action *a) { u8 tcfm_bos; rcu_read_lock(); tcfm_bos = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_bos; rcu_read_unlock(); return tcfm_bos; } static inline u8 tcf_mpls_ttl(const struct tc_action *a) { u8 tcfm_ttl; rcu_read_lock(); tcfm_ttl = rcu_dereference(to_mpls(a)->mpls_p)->tcfm_ttl; rcu_read_unlock(); return tcfm_ttl; } #endif /* __NET_TC_MPLS_H */
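/*
 * Illustrative sketch (userspace analogy, not kernel code and not RCU): every
 * accessor in this header takes rcu_read_lock(), rcu_dereference()s
 * tcf_mpls::mpls_p and copies a single field out of the parameter block
 * before unlocking. The hypothetical snippet below mimics only the "snapshot
 * the pointer once, then read through the snapshot" part with a C11 acquire
 * load; it does not model RCU's deferred reclamation. All names are invented
 * for the sketch.
 */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct mpls_params {
	uint32_t label;
	uint8_t ttl;
};

/* Published parameter block; a writer would atomic_store a new struct here. */
static struct mpls_params initial = { .label = 100, .ttl = 64 };
static _Atomic(struct mpls_params *) current_params = &initial;

static uint32_t read_label(void)
{
	/* the acquire load plays the role of rcu_dereference() in this analogy */
	struct mpls_params *p = atomic_load_explicit(&current_params,
						     memory_order_acquire);
	return p->label;
}

int main(void)
{
	printf("label = %" PRIu32 "\n", read_label());
	return 0;
}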
// SPDX-License-Identifier: GPL-2.0-only /* DVB USB compliant linux driver for mobile DVB-T USB devices based on * reference designs made by DiBcom (http://www.dibcom.fr/) (DiB3000M-C/P) * * Copyright (C) 2004-5 Patrick Boettcher (patrick.boettcher@posteo.de) * * based on GPL code from DiBcom, which has * Copyright (C) 2004 Amaury Demol for DiBcom * * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information */ #include "dibusb.h" DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); /* USB Driver stuff */ static struct dvb_usb_device_properties dibusb_mc_properties; static int dibusb_mc_probe(struct usb_interface *intf, const struct usb_device_id *id) { return dvb_usb_device_init(intf, &dibusb_mc_properties, THIS_MODULE, NULL, adapter_nr); } /* do not change the order of the ID table */ enum { DIBCOM_MOD3001_COLD, DIBCOM_MOD3001_WARM, ULTIMA_TVBOX_USB2_COLD, ULTIMA_TVBOX_USB2_WARM, LITEON_DVB_T_COLD, LITEON_DVB_T_WARM, EMPIA_DIGIVOX_MINI_SL_COLD, EMPIA_DIGIVOX_MINI_SL_WARM, GRANDTEC_DVBT_USB2_COLD, GRANDTEC_DVBT_USB2_WARM, ULTIMA_ARTEC_T14_COLD, ULTIMA_ARTEC_T14_WARM, LEADTEK_WINFAST_DTV_DONGLE_COLD, LEADTEK_WINFAST_DTV_DONGLE_WARM, HUMAX_DVB_T_STICK_HIGH_SPEED_COLD, HUMAX_DVB_T_STICK_HIGH_SPEED_WARM, }; static struct usb_device_id dibusb_dib3000mc_table[] = { DVB_USB_DEV(DIBCOM, DIBCOM_MOD3001_COLD), DVB_USB_DEV(DIBCOM, DIBCOM_MOD3001_WARM), DVB_USB_DEV(ULTIMA_ELECTRONIC, ULTIMA_TVBOX_USB2_COLD), DVB_USB_DEV(ULTIMA_ELECTRONIC, ULTIMA_TVBOX_USB2_WARM), DVB_USB_DEV(LITEON, LITEON_DVB_T_COLD), DVB_USB_DEV(LITEON, LITEON_DVB_T_WARM), DVB_USB_DEV(EMPIA, EMPIA_DIGIVOX_MINI_SL_COLD), DVB_USB_DEV(EMPIA, EMPIA_DIGIVOX_MINI_SL_WARM), DVB_USB_DEV(GRANDTEC, GRANDTEC_DVBT_USB2_COLD), DVB_USB_DEV(GRANDTEC, GRANDTEC_DVBT_USB2_WARM), DVB_USB_DEV(ULTIMA_ELECTRONIC, ULTIMA_ARTEC_T14_COLD), DVB_USB_DEV(ULTIMA_ELECTRONIC, ULTIMA_ARTEC_T14_WARM), DVB_USB_DEV(LEADTEK, LEADTEK_WINFAST_DTV_DONGLE_COLD), DVB_USB_DEV(LEADTEK, LEADTEK_WINFAST_DTV_DONGLE_WARM), DVB_USB_DEV(HUMAX_COEX, HUMAX_DVB_T_STICK_HIGH_SPEED_COLD), DVB_USB_DEV(HUMAX_COEX, HUMAX_DVB_T_STICK_HIGH_SPEED_WARM), { } }; MODULE_DEVICE_TABLE (usb, dibusb_dib3000mc_table); static struct dvb_usb_device_properties dibusb_mc_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = CYPRESS_FX2, .firmware = "dvb-usb-dibusb-6.0.0.8.fw", .num_adapters = 1, .adapter = { { .num_frontends = 1, .fe = {{ .caps = DVB_USB_ADAP_HAS_PID_FILTER | DVB_USB_ADAP_PID_FILTER_CAN_BE_TURNED_OFF, .pid_filter_count = 32, .streaming_ctrl = dibusb2_0_streaming_ctrl, .pid_filter = dibusb_pid_filter, .pid_filter_ctrl = dibusb_pid_filter_ctrl, .frontend_attach = dibusb_dib3000mc_frontend_attach, .tuner_attach = dibusb_dib3000mc_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x06, .u = { .bulk = { .buffersize = 4096, } } }, }}, .size_of_priv = sizeof(struct dibusb_state), } }, .power_ctrl = dibusb2_0_power_ctrl, .rc.legacy = { .rc_interval = 
DEFAULT_RC_INTERVAL, .rc_map_table = rc_map_dibusb_table, .rc_map_size = 111, /* FIXME */ .rc_query = dibusb_rc_query, }, .i2c_algo = &dibusb_i2c_algo, .generic_bulk_ctrl_endpoint = 0x01, .num_device_descs = 8, .devices = { { "DiBcom USB2.0 DVB-T reference design (MOD3000P)", { &dibusb_dib3000mc_table[DIBCOM_MOD3001_COLD], NULL }, { &dibusb_dib3000mc_table[DIBCOM_MOD3001_WARM], NULL }, }, { "Artec T1 USB2.0 TVBOX (please check the warm ID)", { &dibusb_dib3000mc_table[ULTIMA_TVBOX_USB2_COLD], NULL }, { &dibusb_dib3000mc_table[ULTIMA_TVBOX_USB2_WARM], NULL }, }, { "LITE-ON USB2.0 DVB-T Tuner", /* Also rebranded as Intuix S800, Toshiba */ { &dibusb_dib3000mc_table[LITEON_DVB_T_COLD], NULL }, { &dibusb_dib3000mc_table[LITEON_DVB_T_WARM], NULL }, }, { "MSI Digivox Mini SL", { &dibusb_dib3000mc_table[EMPIA_DIGIVOX_MINI_SL_COLD], NULL }, { &dibusb_dib3000mc_table[EMPIA_DIGIVOX_MINI_SL_WARM], NULL }, }, { "GRAND - USB2.0 DVB-T adapter", { &dibusb_dib3000mc_table[GRANDTEC_DVBT_USB2_COLD], NULL }, { &dibusb_dib3000mc_table[GRANDTEC_DVBT_USB2_WARM], NULL }, }, { "Artec T14 - USB2.0 DVB-T", { &dibusb_dib3000mc_table[ULTIMA_ARTEC_T14_COLD], NULL }, { &dibusb_dib3000mc_table[ULTIMA_ARTEC_T14_WARM], NULL }, }, { "Leadtek - USB2.0 Winfast DTV dongle", { &dibusb_dib3000mc_table[LEADTEK_WINFAST_DTV_DONGLE_COLD], NULL }, { &dibusb_dib3000mc_table[LEADTEK_WINFAST_DTV_DONGLE_WARM], NULL }, }, { "Humax/Coex DVB-T USB Stick 2.0 High Speed", { &dibusb_dib3000mc_table[HUMAX_DVB_T_STICK_HIGH_SPEED_COLD], NULL }, { &dibusb_dib3000mc_table[HUMAX_DVB_T_STICK_HIGH_SPEED_WARM], NULL }, }, { NULL }, } }; static struct usb_driver dibusb_mc_driver = { .name = "dvb_usb_dibusb_mc", .probe = dibusb_mc_probe, .disconnect = dvb_usb_device_exit, .id_table = dibusb_dib3000mc_table, }; module_usb_driver(dibusb_mc_driver); MODULE_AUTHOR("Patrick Boettcher <patrick.boettcher@posteo.de>"); MODULE_DESCRIPTION("Driver for DiBcom USB2.0 DVB-T (DiB3000M-C/P based) devices"); MODULE_VERSION("1.0"); MODULE_LICENSE("GPL");
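/*
 * Illustrative sketch (not part of the driver): dibusb_dib3000mc_table[]
 * above is indexed by the enum of COLD/WARM ids so that each entry in
 * .devices[] can reference its cold (pre-firmware) and warm (post-firmware)
 * USB id by symbolic name rather than by raw position. The hypothetical
 * structs and names below show the same enum-indexed pairing in plain C.
 */
#include <stdio.h>

enum { DEMO_DEVICE_COLD, DEMO_DEVICE_WARM, DEMO_ID_COUNT };

struct demo_usb_id {
	unsigned int vendor;
	unsigned int product;
};

static const struct demo_usb_id demo_id_table[DEMO_ID_COUNT] = {
	[DEMO_DEVICE_COLD] = { 0x1234, 0x0001 },   /* before firmware upload */
	[DEMO_DEVICE_WARM] = { 0x1234, 0x0002 },   /* after firmware upload */
};

struct demo_device_desc {
	const char *name;
	const struct demo_usb_id *cold_id;
	const struct demo_usb_id *warm_id;
};

static const struct demo_device_desc demo_device = {
	.name	 = "Example DVB-T stick",
	.cold_id = &demo_id_table[DEMO_DEVICE_COLD],
	.warm_id = &demo_id_table[DEMO_DEVICE_WARM],
};

int main(void)
{
	printf("%s: cold %04x:%04x, warm %04x:%04x\n", demo_device.name,
	       demo_device.cold_id->vendor, demo_device.cold_id->product,
	       demo_device.warm_id->vendor, demo_device.warm_id->product);
	return 0;
}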
// SPDX-License-Identifier: GPL-2.0-only /* * (C) 1997 Linus Torvalds * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ #include <linux/export.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> #include <linux/cdev.h> #include <linux/memblock.h> #include <linux/fsnotify.h> #include <linux/mount.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> #include <linux/iversion.h> #include <linux/rw_hint.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <trace/events/writeback.h> #define CREATE_TRACE_POINTS #include <trace/events/timestamp.h> #include "internal.h" /* * Inode locking rules: * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * * Lock ordering: * * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock * * inode_hash_lock * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock * inode_hash_lock */ static unsigned int i_hash_mask __ro_after_init; static unsigned int i_hash_shift __ro_after_init; static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. */ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 
0 : sum; } static inline long get_nr_inodes_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } #ifdef CONFIG_DEBUG_FS static DEFINE_PER_CPU(long, mg_ctime_updates); static DEFINE_PER_CPU(long, mg_fine_stamps); static DEFINE_PER_CPU(long, mg_ctime_swaps); static unsigned long get_mg_ctime_updates(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_updates, i)); return sum; } static unsigned long get_mg_fine_stamps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_fine_stamps, i)); return sum; } static unsigned long get_mg_ctime_swaps(void) { unsigned long sum = 0; int i; for_each_possible_cpu(i) sum += data_race(per_cpu(mg_ctime_swaps, i)); return sum; } #define mgtime_counter_inc(__var) this_cpu_inc(__var) static int mgts_show(struct seq_file *s, void *p) { unsigned long ctime_updates = get_mg_ctime_updates(); unsigned long ctime_swaps = get_mg_ctime_swaps(); unsigned long fine_stamps = get_mg_fine_stamps(); unsigned long floor_swaps = timekeeping_get_mg_floor_swaps(); seq_printf(s, "%lu %lu %lu %lu\n", ctime_updates, ctime_swaps, fine_stamps, floor_swaps); return 0; } DEFINE_SHOW_ATTRIBUTE(mgts); static int __init mg_debugfs_init(void) { debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); return 0; } late_initcall(mg_debugfs_init); #else /* ! CONFIG_DEBUG_FS */ #define mgtime_counter_inc(__var) do { } while (0) #endif /* CONFIG_DEBUG_FS */ /* * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL /* * Statistics gathering.. */ static struct inodes_stat_t inodes_stat; static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table inodes_sysctls[] = { { .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, }; static int __init init_fs_inode_sysctls(void) { register_sysctl_init("fs", inodes_sysctls); return 0; } early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) { return -ENXIO; } /** * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. * If there are additional allocations required @gfp is used. 
*/ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; inode->i_state = 0; atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) inode->i_opflags |= IOP_XATTR; if (sb->s_type->fs_flags & FS_MGTIME) inode->i_opflags |= IOP_MGTIME; i_uid_write(inode, 0); i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_write_hint = WRITE_LIFE_NOT_SET; inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; #ifdef CONFIG_CGROUP_WRITEBACK inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; #endif spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); if (sb->s_iflags & SB_I_STABLE_WRITES) mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif inode->i_flctx = NULL; if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { kmem_cache_free(inode_cachep, inode); } EXPORT_SYMBOL(free_inode_nonrcu); static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); if (inode->free_inode) inode->free_inode(inode); else free_inode_nonrcu(inode); } static struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; if (ops->alloc_inode) inode = ops->alloc_inode(sb); else inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL); if (!inode) return NULL; if (unlikely(inode_init_always(sb, inode))) { if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return NULL; } inode->free_inode = ops->free_inode; i_callback(&inode->i_rcu); return NULL; } return inode; } void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); } #ifdef CONFIG_FS_POSIX_ACL if (inode->i_acl && !is_uncached_acl(inode->i_acl)) 
posix_acl_release(inode->i_acl); if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) posix_acl_release(inode->i_default_acl); #endif this_cpu_dec(nr_inodes); } EXPORT_SYMBOL(__destroy_inode); static void destroy_inode(struct inode *inode) { const struct super_operations *ops = inode->i_sb->s_op; BUG_ON(!list_empty(&inode->i_lru)); __destroy_inode(inode); if (ops->destroy_inode) { ops->destroy_inode(inode); if (!ops->free_inode) return; } inode->free_inode = ops->free_inode; call_rcu(&inode->i_rcu, i_callback); } /** * drop_nlink - directly drop an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. In cases * where we are attempting to track writes to the * filesystem, a decrement to zero means an imminent * write when the file is truncated and actually unlinked * on the filesystem. */ void drop_nlink(struct inode *inode) { WARN_ON(inode->i_nlink == 0); inode->__i_nlink--; if (!inode->i_nlink) atomic_long_inc(&inode->i_sb->s_remove_count); } EXPORT_SYMBOL(drop_nlink); /** * clear_nlink - directly zero an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. See * drop_nlink() for why we care about i_nlink hitting zero. */ void clear_nlink(struct inode *inode) { if (inode->i_nlink) { inode->__i_nlink = 0; atomic_long_inc(&inode->i_sb->s_remove_count); } } EXPORT_SYMBOL(clear_nlink); /** * set_nlink - directly set an inode's link count * @inode: inode * @nlink: new nlink (should be non-zero) * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. */ void set_nlink(struct inode *inode, unsigned int nlink) { if (!nlink) { clear_nlink(inode); } else { /* Yes, some filesystems do change nlink from zero to one */ if (inode->i_nlink == 0) atomic_long_dec(&inode->i_sb->s_remove_count); inode->__i_nlink = nlink; } } EXPORT_SYMBOL(set_nlink); /** * inc_nlink - directly increment an inode's link count * @inode: inode * * This is a low-level filesystem helper to replace any * direct filesystem manipulation of i_nlink. Currently, * it is only here for parity with dec_nlink(). */ void inc_nlink(struct inode *inode) { if (unlikely(inode->i_nlink == 0)) { WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); } inode->__i_nlink++; } EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); __address_space_init_once(mapping); } EXPORT_SYMBOL(address_space_init_once); /* * These are initializations that only need to be done * once, because the fields are idempotent across use * of the inode, so let the slab aware of that. 
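/*
 * Illustrative sketch, not part of fs/inode.c: how a filesystem's ->unlink()
 * would typically use the nlink helpers above instead of touching i_nlink
 * directly.  foo_unlink() and foo_remove_dirent() are hypothetical names.
 */
static int foo_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	int err;

	err = foo_remove_dirent(dir, dentry);	/* hypothetical on-disk removal */
	if (err)
		return err;

	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	inode_set_ctime_current(inode);
	drop_nlink(inode);	/* may take i_nlink to zero, see drop_nlink() above */
	mark_inode_dirty(inode);
	return 0;
}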
*/ void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); static void init_once(void *foo) { struct inode *inode = (struct inode *) foo; inode_init_once(inode); } /* * get additional reference to inode; caller must already hold one. */ void ihold(struct inode *inode) { WARN_ON(atomic_inc_return(&inode->i_count) < 2); } EXPORT_SYMBOL(ihold); static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; if (atomic_read(&inode->i_count)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; if (!mapping_shrinkable(&inode->i_data)) return; if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); else if (rotate) inode->i_state |= I_REFERENCED; } struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { void *bit_address; bit_address = inode_state_wait_address(inode, bit); init_wait_var_entry(wqe, bit_address, 0); return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); /* * Add inode to LRU if needed (inode is unused and clean). * * Needs inode->i_lock held. */ void inode_add_lru(struct inode *inode) { __inode_add_lru(inode, false); } static void inode_lru_list_del(struct inode *inode) { if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } static void inode_pin_lru_isolating(struct inode *inode) { lockdep_assert_held(&inode->i_lock); WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); inode->i_state |= I_LRU_ISOLATING; } static void inode_unpin_lru_isolating(struct inode *inode) { spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); inode->i_state &= ~I_LRU_ISOLATING; /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; lockdep_assert_held(&inode->i_lock); if (!(inode->i_state & I_LRU_ISOLATING)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* * Checking I_LRU_ISOLATING with inode->i_lock guarantees * memory ordering. 
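/*
 * Illustrative sketch, not part of fs/inode.c: a filesystem that embeds
 * struct inode in its own per-inode structure mirrors the init_once()/
 * inode_cachep pattern above with a private slab cache and constructor.
 * struct foo_inode_info, foo_inode_cachep and the foo_* functions are
 * hypothetical names.
 */
struct foo_inode_info {
	unsigned long	i_flags;	/* fs-private state */
	struct inode	vfs_inode;	/* set up once via inode_init_once() */
};

static struct kmem_cache *foo_inode_cachep;

static void foo_init_once(void *ptr)
{
	struct foo_inode_info *fi = ptr;

	inode_init_once(&fi->vfs_inode);
}

static int __init foo_init_inodecache(void)
{
	foo_inode_cachep = kmem_cache_create("foo_inode_cache",
					     sizeof(struct foo_inode_info), 0,
					     SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
					     foo_init_once);
	return foo_inode_cachep ? 0 : -ENOMEM;
}

static struct inode *foo_alloc_inode(struct super_block *sb)
{
	struct foo_inode_info *fi;

	fi = alloc_inode_sb(sb, foo_inode_cachep, GFP_KERNEL);
	if (!fi)
		return NULL;
	return &fi->vfs_inode;
}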
*/ if (!(inode->i_state & I_LRU_ISOLATING)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); WARN_ON(inode->i_state & I_LRU_ISOLATING); } /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add */ void inode_sb_list_add(struct inode *inode) { spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); spin_unlock(&inode->i_sb->s_inode_list_lock); } } static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES; tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); return tmp & i_hash_mask; } /** * __insert_inode_hash - hash an inode * @inode: unhashed inode * @hashval: unsigned long value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head_rcu(&inode->i_hash, b); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** * __remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * * Remove an inode from the superblock. */ void __remove_inode_hash(struct inode *inode) { spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init_rcu(&inode->i_hash); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__remove_inode_hash); void dump_mapping(const struct address_space *mapping) { struct inode *host; const struct address_space_operations *a_ops; struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; unsigned long ino; /* * If mapping is an invalid pointer, we don't want to crash * accessing it, so probe everything depending on it carefully. */ if (get_kernel_nofault(host, &mapping->host) || get_kernel_nofault(a_ops, &mapping->a_ops)) { pr_warn("invalid mapping:%px\n", mapping); return; } if (!host) { pr_warn("aops:%ps\n", a_ops); return; } if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || get_kernel_nofault(ino, &host->i_ino)) { pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); return; } if (!dentry_first) { pr_warn("aops:%ps ino:%lx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) strscpy(fname, "<invalid>"); /* * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", a_ops, ino, fname); } void clear_inode(struct inode *inode) { /* * We have to cycle the i_pages lock here because reclaim can be in the * process of removing the last page (in __filemap_remove_folio()) * and we must not free the mapping under it. 
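/*
 * Illustrative sketch, not part of fs/inode.c: a filesystem that builds an
 * inode by hand (rather than via iget_locked() and friends) hashes it itself
 * once i_ino is known, using the hashing helpers above.  foo_new_hashed_inode()
 * is a hypothetical name.
 */
static struct inode *foo_new_hashed_inode(struct super_block *sb,
					  unsigned long ino)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;
	inode->i_ino = ino;
	/* wrapper for __insert_inode_hash(inode, inode->i_ino) */
	insert_inode_hash(inode);
	return inode;
}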
*/ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); /* * Almost always, mapping_empty(&inode->i_data) here; but there are * two known and long-standing ways in which nodes may get left behind * (when deep radix-tree node allocation failed partway; or when THP * collapse_file() failed). Until those two known cases are cleaned up, * or a cleanup function is called here, do not BUG_ON(!mapping_empty), * nor even WARN_ON(!mapping_empty). */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected * to. We remove any pages still attached to the inode and wait for any IO that * is still in progress before finally destroying the inode. * * An inode must already be marked I_FREEING so that we avoid the inode being * moved back onto lists if we race with other code that manipulates the lists * (e.g. writeback_single_inode). The caller is responsible for setting this. * * An inode must already be removed from the LRU list before being evicted from * the cache. This should occur atomically with setting the I_FREEING state * flag, so no inodes here should ever be on the LRU when being evicted. */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); if (!list_empty(&inode->i_io_list)) inode_io_list_del(inode); inode_sb_list_del(inode); spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since * the inode has I_FREEING set, flusher thread won't start new work on * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); } else { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); remove_inode_hash(inode); /* * Wake up waiters in __wait_on_freeing_inode(). * * Lockless hash lookup may end up finding the inode before we removed * it above, but only lock it *after* we are done with the wakeup below. * In this case the potential waiter cannot safely block. * * The inode being unhashed after the call to remove_inode_hash() is * used as an indicator whether blocking on it is safe. */ spin_lock(&inode->i_lock); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); destroy_inode(inode); } /* * dispose_list - dispose of the contents of a local list * @head: the head of the list to free * * Dispose-list gets a local list with local inodes in it, so it doesn't * need to worry about list corruption and SMP locks. 
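/*
 * Illustrative sketch, not part of fs/inode.c: the usual shape of a
 * ->evict_inode() implementation that evict() below ends up calling.
 * foo_free_blocks() is a hypothetical on-disk release step.
 */
static void foo_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);

	if (!inode->i_nlink && !is_bad_inode(inode))
		foo_free_blocks(inode);		/* hypothetical on-disk release */

	/* required: leaves the inode in I_FREEING | I_CLEAR state */
	clear_inode(inode);
}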
*/ static void dispose_list(struct list_head *head) { while (!list_empty(head)) { struct inode *inode; inode = list_first_entry(head, struct inode, i_lru); list_del_init(&inode->i_lru); evict(inode); cond_resched(); } } /** * evict_inodes - evict all evictable inodes for a superblock * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ void evict_inodes(struct super_block *sb) { struct inode *inode, *next; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; spin_lock(&inode->i_lock); if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); /* * We can have a ton of inodes to evict at unmount time given * enough memory, check to see if we need to go to sleep for a * bit so we don't livelock. */ if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on * * Attempts to free all inodes (including dirty inodes) for a given superblock. */ void invalidate_inodes(struct super_block *sb) { struct inode *inode, *next; LIST_HEAD(dispose); again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; } if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } inode->i_state |= I_FREEING; inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); if (need_resched()) { spin_unlock(&sb->s_inode_list_lock); cond_resched(); dispose_list(&dispose); goto again; } } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } /* * Isolate the inode from the LRU in preparation for freeing it. * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another * pass through the LRU before it gets reclaimed. This is necessary because of * the fact we are doing lazy LRU updates to minimise lock contention so the * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ static enum lru_status inode_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); /* * We are inverting the lru lock/inode->i_lock here, so use a * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* * Inodes can get referenced, redirtied, or repopulated while * they're already on the LRU, and this can make them * unreclaimable for a while. 
Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } /* * On highmem systems, mapping_shrinkable() permits dropping * page cache in order to free up struct inodes: lowmem might * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(&lru->lock); if (remove_inode_buffers(inode)) { unsigned long reap; reap = invalidate_mapping_pages(&inode->i_data, 0, -1); if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); return LRU_RETRY; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } /* * Walk the superblock inode LRU for freeable inodes and attempt to free them. * This is called from the superblock shrinker function with a number of inodes * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); /* * Called with the inode lock held. */ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * find_inode_fast is the fast path version of find_inode, see the comment at * iget_locked for details. 
*/ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, bool is_inode_hash_locked) { struct inode *inode = NULL; if (is_inode_hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); rcu_read_lock(); repeat: hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode, is_inode_hash_locked); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); return ERR_PTR(-ESTALE); } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); return inode; } rcu_read_unlock(); return NULL; } /* * Each cpu owns a range of LAST_INO_BATCH numbers. * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, * to renew the exhausted range. * * This does not significantly increase overflow rate because every CPU can * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the * 2^32 range, and is a worst-case. Even a 50% wastage would only increase * overflow rate by 2x, which does not seem too significant. * * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ #define LAST_INO_BATCH 1024 static DEFINE_PER_CPU(unsigned int, last_ino); unsigned int get_next_ino(void) { unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); res = next - LAST_INO_BATCH; } #endif res++; /* get_next_ino should not provide a 0 inode number */ if (unlikely(!res)) res++; *p = res; put_cpu_var(last_ino); return res; } EXPORT_SYMBOL(get_next_ino); /** * new_inode_pseudo - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. * Inode wont be chained in superblock s_inodes list * This means : * - fs can't be unmount * - quotas, fsnotify, writeback can't work */ struct inode *new_inode_pseudo(struct super_block *sb) { return alloc_inode(sb); } /** * new_inode - obtain an inode * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. 
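/*
 * Illustrative sketch, not part of fs/inode.c: an in-memory filesystem
 * combining get_next_ino() above with new_inode(), since it has no on-disk
 * inode numbers of its own.  foo_get_inode() is a hypothetical name.
 */
static struct inode *foo_get_inode(struct super_block *sb, umode_t mode)
{
	struct inode *inode = new_inode(sb);
	struct timespec64 now;

	if (!inode)
		return NULL;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	now = inode_set_ctime_current(inode);
	inode_set_mtime_to_ts(inode, now);
	inode_set_atime_to_ts(inode, now);
	return inode;
}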
* If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the * newly created inode's mapping * */ struct inode *new_inode(struct super_block *sb) { struct inode *inode; inode = new_inode_pseudo(sb); if (inode) inode_sb_list_add(inode); return inode; } EXPORT_SYMBOL(new_inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC void lockdep_annotate_inode_mutex_key(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ // mutex_destroy(&inode->i_mutex); init_rwsem(&inode->i_rwsem); lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } } EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); #endif /** * unlock_new_inode - clear the I_NEW state and wake up any waiters * @inode: new inode to unlock * * Called when the inode is fully initialised to clear the new state of the * inode and wake up anyone waiting for the inode to finish initialisation. */ void unlock_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); void discard_new_inode(struct inode *inode) { lockdep_annotate_inode_mutex_key(inode); spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } EXPORT_SYMBOL(discard_new_inode); /** * lock_two_nondirectories - take two i_mutexes on non-directory objects * * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. 
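/*
 * Illustrative sketch, not part of fs/inode.c: once a filesystem holds a new,
 * hashed inode carrying I_NEW (e.g. obtained via inode_insert5() or
 * insert_inode_locked(), both below), it either publishes it with
 * unlock_new_inode() or, if setup failed, throws it away with
 * discard_new_inode().  foo_finish_inode() and foo_read_ondisk() are
 * hypothetical names.
 */
static struct inode *foo_finish_inode(struct inode *inode)
{
	int err = foo_read_ondisk(inode);	/* hypothetical fill step */

	if (err) {
		discard_new_inode(inode);	/* clears I_NEW and drops the ref */
		return ERR_PTR(err);
	}
	unlock_new_inode(inode);
	return inode;
}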
* * @inode1: first inode to lock * @inode2: second inode to lock */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); if (inode2) WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); if (inode1 > inode2) swap(inode1, inode2); if (inode1) inode_lock(inode1); if (inode2 && inode2 != inode1) inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); /** * unlock_two_nondirectories - release locks from lock_two_nondirectories() * @inode1: first inode to unlock * @inode2: second inode to unlock */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1) { WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); } if (inode2 && inode2 != inode1) { WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); } } EXPORT_SYMBOL(unlock_two_nondirectories); /** * inode_insert5 - obtain an inode from a mounted file system * @inode: pre-allocated inode to use for insert to cache * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. This is a * variant of iget5_locked() that doesn't allocate an inode. * * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. */ spin_unlock(&inode_hash_lock); if (IS_ERR(old)) return NULL; wait_on_inode(old); if (unlikely(inode_unhashed(old))) { iput(old); goto again; } return old; } if (set && unlikely(set(inode, data))) { inode = NULL; goto unlock; } /* * Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); return inode; } EXPORT_SYMBOL(inode_insert5); /** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, * and if present return it with an increased reference count. 
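/*
 * Illustrative sketch, not part of fs/inode.c: a clone/dedupe style helper
 * locking two regular-file inodes with the helpers above; lock ordering and
 * the same-inode case are handled by the helpers themselves.
 */
static void foo_lock_pair_example(struct inode *src, struct inode *dst)
{
	lock_two_nondirectories(src, dst);
	/* ... compare or exchange file contents under both i_rwsems ... */
	unlock_two_nondirectories(src, dst);
}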
This is a * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. * * If the inode is not present in the cache, allocate and insert a new inode * and return it locked, hashed, and with the I_NEW flag set. The file system * gets to fill it in before unlocking it via unlock_new_inode(). * * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct inode *inode = ilookup5(sb, hashval, test, data); if (!inode) { struct inode *new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } } return inode; } EXPORT_SYMBOL(iget5_locked); /** * iget5_locked_rcu - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get * @test: callback used for comparisons between inodes * @set: callback used to initialize a new struct inode * @data: opaque data pointer to pass to @test and @set * * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. */ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; again: inode = find_inode(sb, head, test, data, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } new = alloc_inode(sb); if (new) { inode = inode_insert5(new, hashval, test, set, data); if (unlikely(inode != new)) destroy_inode(new); } return inode; } EXPORT_SYMBOL_GPL(iget5_locked_rcu); /** * iget_locked - obtain an inode from a mounted file system * @sb: super block of file system * @ino: inode number to get * * Search for the inode specified by @ino in the inode cache and if present * return it with an increased reference count. This is for file systems * where the inode number is sufficient for unique identification of an inode. * * If the inode is not in cache, allocate a new inode and return it locked, * hashed, and with the I_NEW flag set. The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } return inode; } inode = alloc_inode(sb); if (inode) { struct inode *old; spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino, true); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents */ return inode; } /* * Uhhuh, somebody else created the same inode under * us. Use the old inode instead of the one we just * allocated. 
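/*
 * Illustrative sketch, not part of fs/inode.c: using iget5_locked() when a
 * 32-bit i_ino cannot uniquely identify an object.  struct foo_iget_args,
 * FOO_I() and foo_read_object() are hypothetical names.
 */
struct foo_iget_args {
	u64 objectid;
};

static int foo_iget_test(struct inode *inode, void *data)
{
	struct foo_iget_args *args = data;

	return FOO_I(inode)->objectid == args->objectid;
}

static int foo_iget_set(struct inode *inode, void *data)
{
	struct foo_iget_args *args = data;

	FOO_I(inode)->objectid = args->objectid;
	inode->i_ino = (unsigned long)args->objectid;
	return 0;
}

static struct inode *foo_iget(struct super_block *sb, u64 objectid)
{
	struct foo_iget_args args = { .objectid = objectid };
	struct inode *inode;

	inode = iget5_locked(sb, (unsigned long)objectid,
			     foo_iget_test, foo_iget_set, &args);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW) {
		foo_read_object(inode);		/* hypothetical fill from disk */
		unlock_new_inode(inode);
	}
	return inode;
}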
*/ spin_unlock(&inode_hash_lock); destroy_inode(inode); if (IS_ERR(old)) return NULL; inode = old; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. * If we find one, then the inode number we are trying to * allocate is not unique and so we should not use it. * * Returns 1 if the inode number is unique, 0 if it is not. */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; hlist_for_each_entry_rcu(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } return 1; } /** * iunique - get a unique inode number * @sb: superblock * @max_reserved: highest reserved inode number * * Obtain an inode number that is unique on the system for a given * superblock. This is used by file systems that have no natural * permanent inode numbering system. An inode number is returned that * is higher than the reserved limit but unique. * * BUGS: * With a large number of inodes live on the file system this function * currently becomes quite slow. */ ino_t iunique(struct super_block *sb, ino_t max_reserved) { /* * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW * error if st_ino won't fit in target struct field. Use 32bit counter * here to attempt to avoid that. */ static DEFINE_SPINLOCK(iunique_lock); static unsigned int counter; ino_t res; rcu_read_lock(); spin_lock(&iunique_lock); do { if (counter <= max_reserved) counter = max_reserved + 1; res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); rcu_read_unlock(); return res; } EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode->i_lock); if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; } return inode; } EXPORT_SYMBOL(igrab); /** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented * reference count. * * Note: I_NEW is not waited upon so you have to be very careful what you do * with the returned inode. You probably should be using ilookup5() instead. * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data, true); spin_unlock(&inode_hash_lock); return IS_ERR(inode) ? 
NULL : inode; } EXPORT_SYMBOL(ilookup5_nowait); /** * ilookup5 - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * * Search for the inode specified by @hashval and @data in the inode cache, * and if the inode is in the cache, return the inode with an incremented * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * * This is a generalized version of ilookup() for file systems where the * inode number is not sufficient for unique identification of an inode. * * Note: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup5); /** * ilookup - search for an inode in the inode cache * @sb: super block of file system to search * @ino: inode number to search for * * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) return NULL; wait_on_inode(inode); if (unlikely(inode_unhashed(inode))) { iput(inode); goto again; } } return inode; } EXPORT_SYMBOL(ilookup); /** * find_inode_nowait - find an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @match: callback used for comparisons between inodes * @data: opaque data pointer to pass to @match * * Search for the inode specified by @hashval and @data in the inode * cache, where the helper function @match will return 0 if the inode * does not match, 1 if the inode does match, and -1 if the search * should be stopped. The @match function must be responsible for * taking the i_lock spin_lock and checking i_state for an inode being * freed or being initialized, and incrementing the reference count * before returning 1. It also must not sleep, since it is called with * the inode_hash_lock spinlock held. * * This is a even more generalized version of ilookup5() when the * function must never block --- find_inode() can block in * __wait_on_freeing_inode() --- or when the caller can not increment * the reference count because the resulting iput() might cause an * inode eviction. The tradeoff is that the @match funtion must be * very carefully implemented. 
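/*
 * Illustrative sketch, not part of fs/inode.c: probing the inode cache with
 * ilookup() without forcing an inode to be read in, e.g. when a filesystem
 * only wants to invalidate cached pages if the inode is already in memory.
 * foo_maybe_invalidate() is a hypothetical name.
 */
static void foo_maybe_invalidate(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = ilookup(sb, ino);

	if (!inode)
		return;		/* not cached, nothing to do */
	invalidate_mapping_pages(inode->i_mapping, 0, -1);
	iput(inode);		/* drop the reference ilookup() took */
}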
*/ struct inode *find_inode_nowait(struct super_block *sb, unsigned long hashval, int (*match)(struct inode *, unsigned long, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *ret_inode = NULL; int mval; spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, head, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); if (mval == 0) continue; if (mval == 1) ret_inode = inode; goto out; } out: spin_unlock(&inode_hash_lock); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); /** * find_inode_rcu - find an inode in the inode cache * @sb: Super block of file system to search * @hashval: Key to hash * @test: Function to test match on an inode * @data: Data for test function * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. */ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_rcu); /** * find_inode_by_ino_rcu - Find an inode in the inode cache * @sb: Super block of file system to search * @ino: The inode number to match * * Search for the inode specified by @hashval and @data in the inode cache, * where the helper function @test will return 0 if the inode does not match * and 1 if it does. The @test function must be responsible for taking the * i_lock spin_lock and checking i_state for an inode being freed or being * initialized. * * If successful, this will return the inode for which the @test function * returned 1 and NULL otherwise. * * The @test function is not permitted to take a ref on any inode presented. * It is also not permitted to sleep. * * The caller must hold the RCU read lock. 
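/*
 * Illustrative sketch, not part of fs/inode.c: an RCU-protected cache probe.
 * The inode returned by find_inode_by_ino_rcu() carries no reference, so a
 * stable reference is taken with igrab() before the RCU read lock is dropped.
 * foo_grab_cached() is a hypothetical name.
 */
static struct inode *foo_grab_cached(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	rcu_read_lock();
	inode = find_inode_by_ino_rcu(sb, ino);
	if (inode)
		inode = igrab(inode);	/* NULL if the inode is being freed */
	rcu_read_unlock();
	return inode;
}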
*/ struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); hlist_for_each_entry_rcu(inode, head, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) return inode; } return NULL; } EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); if (old->i_state & (I_FREEING|I_WILL_FREE)) { spin_unlock(&old->i_lock); continue; } break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); return 0; } if (unlikely(old->i_state & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); return -EBUSY; } __iget(old); spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); return -EBUSY; } iput(old); } } EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); if (old != inode) { iput(old); return -EBUSY; } return 0; } EXPORT_SYMBOL(insert_inode_locked4); int generic_delete_inode(struct inode *inode) { return 1; } EXPORT_SYMBOL(generic_delete_inode); /* * Called when we're dropping the last reference * to an inode. * * Call the FS "drop_inode()" function, defaulting to * the legacy UNIX filesystem behaviour. If it tells * us to evict inode, do so. Otherwise, retain inode * in cache if fs is alive, sync and evict if fs is * shutting down. */ static void iput_final(struct inode *inode) { struct super_block *sb = inode->i_sb; const struct super_operations *op = inode->i_sb->s_op; unsigned long state; int drop; WARN_ON(inode->i_state & I_NEW); if (op->drop_inode) drop = op->drop_inode(inode); else drop = generic_drop_inode(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } state = inode->i_state; if (!drop) { WRITE_ONCE(inode->i_state, state | I_WILL_FREE); spin_unlock(&inode->i_lock); write_inode_now(inode, 1); spin_lock(&inode->i_lock); state = inode->i_state; WARN_ON(state & I_NEW); state &= ~I_WILL_FREE; } WRITE_ONCE(inode->i_state, state | I_FREEING); if (!list_empty(&inode->i_lru)) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); evict(inode); } /** * iput - put an inode * @inode: inode to put * * Puts an inode, dropping its usage count. If the inode use count hits * zero, the inode is then freed and may also be destroyed. * * Consequently, iput() can sleep. 
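/*
 * Illustrative sketch, not part of fs/inode.c: a filesystem that never wants
 * unreferenced inodes cached (pipes and some network filesystems do this)
 * points ->drop_inode at generic_delete_inode() so that iput_final() evicts
 * immediately instead of parking the inode on the LRU.  foo_sops and the
 * referenced foo_* helpers are hypothetical, from the earlier sketches.
 */
static const struct super_operations foo_sops = {
	.alloc_inode	= foo_alloc_inode,
	.evict_inode	= foo_evict_inode,
	.drop_inode	= generic_delete_inode,
};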
*/ void iput(struct inode *inode) { if (!inode) return; BUG_ON(inode->i_state & I_CLEAR); retry: if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); goto retry; } iput_final(inode); } } EXPORT_SYMBOL(iput); #ifdef CONFIG_BLOCK /** * bmap - find a block number in a file * @inode: inode owning the block number being requested * @block: pointer containing the block to find * * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { if (!inode->i_mapping->a_ops->bmap) return -EINVAL; *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); return 0; } EXPORT_SYMBOL(bmap); #endif /* * With relative atime, only update atime if the previous atime is * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. */ static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return true; /* * Good, we can skip the atime update: */ return false; } /** * inode_update_timestamps - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. This function handles updating the * actual timestamps. It's up to the caller to ensure that the inode is marked * dirty appropriately. * * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, * attempt to update all three of them. S_ATIME updates can be handled * independently of the rest. * * Returns a set of S_* flags indicating which values changed. 
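/*
 * Illustrative sketch, not part of fs/inode.c (assumes CONFIG_BLOCK): mapping
 * a file block to a device block with bmap(), much as the FIBMAP ioctl does.
 * Block 4 is an arbitrary example value; foo_block_of() is hypothetical.
 */
static sector_t foo_block_of(struct inode *inode)
{
	sector_t blk = 4;

	if (bmap(inode, &blk))
		return 0;	/* filesystem has no ->bmap */
	return blk;		/* 0 also means the block falls in a hole */
}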
*/ int inode_update_timestamps(struct inode *inode, int flags) { int updated = 0; struct timespec64 now; if (flags & (S_MTIME|S_CTIME|S_VERSION)) { struct timespec64 ctime = inode_get_ctime(inode); struct timespec64 mtime = inode_get_mtime(inode); now = inode_set_ctime_current(inode); if (!timespec64_equal(&now, &ctime)) updated |= S_CTIME; if (!timespec64_equal(&now, &mtime)) { inode_set_mtime_to_ts(inode, now); updated |= S_MTIME; } if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) updated |= S_VERSION; } else { now = current_time(inode); } if (flags & S_ATIME) { struct timespec64 atime = inode_get_atime(inode); if (!timespec64_equal(&now, &atime)) { inode_set_atime_to_ts(inode, now); updated |= S_ATIME; } } return updated; } EXPORT_SYMBOL(inode_update_timestamps); /** * generic_update_time - update the timestamps on the inode * @inode: inode to be updated * @flags: S_* flags that needed to be updated * * The update_time function is called when an inode's timestamps need to be * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME * updates can be handled done independently of the rest. * * Returns a S_* mask indicating which fields were updated. */ int generic_update_time(struct inode *inode, int flags) { int updated = inode_update_timestamps(inode, flags); int dirty_flags = 0; if (updated & (S_ATIME|S_MTIME|S_CTIME)) dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; if (updated & S_VERSION) dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); return updated; } EXPORT_SYMBOL(generic_update_time); /* * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) return inode->i_op->update_time(inode, flags); generic_update_time(inode, flags); return 0; } EXPORT_SYMBOL(inode_update_time); /** * atime_needs_update - update the access time * @path: the &struct path to update * @inode: inode to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. 
*/ if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode)) return false; if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; now = current_time(inode); if (!relatime_need_update(mnt, inode, now)) return false; atime = inode_get_atime(inode); if (timespec64_equal(&atime, &now)) return false; return true; } void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for * Btrfs), but since we touch atime while walking down the path we * really don't care if we failed to update the atime of the file, * so just ignore the return value. * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); /* * Return mask of changes for notify_change() that need to be done as a * response to write or truncate. Return 0 if nothing has to be changed. * Negative value on error (change should be denied). */ int dentry_needs_remove_privs(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int mask = 0; int ret; if (IS_NOSEC(inode)) return 0; mask = setattr_should_drop_suidgid(idmap, inode); ret = security_inode_need_killpriv(dentry); if (ret < 0) return ret; if (ret) mask |= ATTR_KILL_PRIV; return mask; } static int __remove_privs(struct mnt_idmap *idmap, struct dentry *dentry, int kill) { struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; /* * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ return notify_change(idmap, dentry, &newattrs, NULL); } int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry); if (kill < 0) return kill; if (kill) { if (flags & IOCB_NOWAIT) return -EAGAIN; error = __remove_privs(file_mnt_idmap(file), dentry, kill); } if (!error) inode_has_no_xattr(inode); return error; } EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) * @file: file to remove privileges from * * When file is modified by a write or truncation ensure that special * file privileges are removed. * * Return: 0 on success, negative errno on failure. */ int file_remove_privs(struct file *file) { return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); /** * current_time - Return FS time (possibly fine-grained) * @inode: inode. * * Return the current time truncated to the time granularity supported by * the fs, as suitable for a ctime/mtime change. If the ctime is flagged * as having been QUERIED, get a fine-grained timestamp, but don't update * the floor. * * For a multigrain inode, this is effectively an estimate of the timestamp * that a file would receive. 
 * An actual update must go through inode_set_ctime_current().
 */
struct timespec64 current_time(struct inode *inode)
{
	struct timespec64 now;
	u32 cns;

	ktime_get_coarse_real_ts64_mg(&now);

	if (!is_mgtime(inode))
		goto out;

	/* If nothing has queried it, then coarse time is fine */
	cns = smp_load_acquire(&inode->i_ctime_nsec);
	if (cns & I_CTIME_QUERIED) {
		/*
		 * If there is no apparent change, then get a fine-grained
		 * timestamp.
		 */
		if (now.tv_nsec == (cns & ~I_CTIME_QUERIED))
			ktime_get_real_ts64(&now);
	}
out:
	return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);

static int inode_needs_update_time(struct inode *inode)
{
	struct timespec64 now, ts;
	int sync_it = 0;

	/* First try to exhaust all avenues to not sync */
	if (IS_NOCMTIME(inode))
		return 0;

	now = current_time(inode);

	ts = inode_get_mtime(inode);
	if (!timespec64_equal(&ts, &now))
		sync_it |= S_MTIME;

	ts = inode_get_ctime(inode);
	if (!timespec64_equal(&ts, &now))
		sync_it |= S_CTIME;

	if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
		sync_it |= S_VERSION;

	return sync_it;
}

static int __file_update_time(struct file *file, int sync_mode)
{
	int ret = 0;
	struct inode *inode = file_inode(file);

	/* try to update time settings */
	if (!mnt_get_write_access_file(file)) {
		ret = inode_update_time(inode, sync_mode);
		mnt_put_write_access_file(file);
	}

	return ret;
}

/**
 * file_update_time - update mtime and ctime
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the _NOCMTIME inode
 * flag, e.g. for network filesystems where these timestamps are handled
 * by the server. This can return an error for file systems that need to
 * allocate space in order to update an inode.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_update_time(struct file *file)
{
	int ret;
	struct inode *inode = file_inode(file);

	ret = inode_needs_update_time(inode);
	if (ret <= 0)
		return ret;

	return __file_update_time(file, ret);
}
EXPORT_SYMBOL(file_update_time);

/**
 * file_modified_flags - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 * @flags: kiocb flags
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * If IOCB_NOWAIT is set, special file privileges will not be removed and
 * time settings will not be updated. It will return -EAGAIN.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int file_modified_flags(struct file *file, int flags)
{
	int ret;
	struct inode *inode = file_inode(file);

	/*
	 * Clear the security bits if the process is not being run by root.
	 * This keeps people from modifying setuid and setgid binaries.
	 */
	ret = file_remove_privs_flags(file, flags);
	if (ret)
		return ret;

	if (unlikely(file->f_mode & FMODE_NOCMTIME))
		return 0;

	ret = inode_needs_update_time(inode);
	if (ret <= 0)
		return ret;
	if (flags & IOCB_NOWAIT)
		return -EAGAIN;

	return __file_update_time(file, ret);
}

/**
 * file_modified - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
*/ int file_modified(struct file *file) { return file_modified_flags(file, 0); } EXPORT_SYMBOL(file_modified); /** * kiocb_modified - handle mandated vfs changes when modifying a file * @iocb: iocb that was modified * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. */ int kiocb_modified(struct kiocb *iocb) { return file_modified_flags(iocb->ki_filp, iocb->ki_flags); } EXPORT_SYMBOL_GPL(kiocb_modified); int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) return 1; if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) return 1; return 0; } EXPORT_SYMBOL(inode_needs_sync); /* * If we try to find an inode in the inode hash while it is being * deleted, we have to wait until the filesystem completes its * deletion before reporting that it isn't found. This function waits * until the deletion _might_ have completed. Callers are responsible * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { WARN_ON(is_inode_hash_locked); spin_unlock(&inode->i_lock); return; } wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); } static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { if (!str) return 0; ihash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("ihash_entries=", set_ihash_entries); /* * Initialize the waitqueues and inode hash table. */ void __init inode_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
*/ if (hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void __init inode_init(void) { /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ if (!hashdist) return; inode_hashtable = alloc_large_system_hash("Inode-cache", sizeof(struct hlist_head), ihash_entries, 14, HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; if (S_ISCHR(mode)) { inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; } else if (S_ISBLK(mode)) { if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) ; /* leave it no_open_fops */ else printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); } EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards * @idmap: idmap of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode * * If the inode has been created through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions * and initializing i_uid and i_gid. On non-idmapped mounts or if permission * checking is to be performed on the raw inode simply pass @nop_mnt_idmap. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode) { inode_fsuid_set(inode, idmap); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; /* Directories are special, and always inherit S_ISGID */ if (S_ISDIR(mode)) mode |= S_ISGID; } else inode_fsgid_set(inode, idmap); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode * @idmap: idmap of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) { vfsuid_t vfsuid; struct user_namespace *ns; vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return true; ns = current_user_ns(); if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ bool inode_dio_finished(const struct inode *inode) { return atomic_read(&inode->i_dio_count) == 0; } EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish * @inode: inode to wait for * * Waits for all pending direct I/O requests to finish so that we can * proceed with a truncate or equivalent operation. 
* * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_mutex. */ void inode_dio_wait(struct inode *inode) { wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); void inode_dio_wait_interruptible(struct inode *inode) { wait_var_event_interruptible(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait_interruptible); /* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_mutex, or else be sure that * they have exclusive access to the inode structure (i.e., while the * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one * code path which doesn't today so we use cmpxchg() out of an abundance * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure * it is so documented in include/linux/fs.h and that all code follows * the locking convention!! */ void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { WARN_ON_ONCE(flags & ~mask); set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); void inode_nohighmem(struct inode *inode) { mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; return ts; } EXPORT_SYMBOL(inode_set_ctime_to_ts); /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated * * Truncate a timespec to the granularity supported by the fs * containing the inode. Always rounds down. gran must * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). */ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned int gran = sb->s_time_gran; t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) t.tv_nsec = 0; /* Avoid division in the common cases 1 ns and 1 s. */ if (gran == 1) ; /* nothing */ else if (gran == NSEC_PER_SEC) t.tv_nsec = 0; else if (gran > 1 && gran < NSEC_PER_SEC) t.tv_nsec -= t.tv_nsec % gran; else WARN(1, "invalid file time granularity: %u", gran); return t; } EXPORT_SYMBOL(timestamp_truncate); /** * inode_set_ctime_current - set the ctime to current_time * @inode: inode * * Set the inode's ctime to the current value for the inode. Returns the * current value that was assigned. If this is not a multigrain inode, then we * set it to the later of the coarse time and floor value. * * If it is multigrain, then we first see if the coarse-grained timestamp is * distinct from what is already there. If so, then use that. Otherwise, get a * fine-grained timestamp. * * After that, try to swap the new value into i_ctime_nsec. Accept the * resulting ctime, regardless of the outcome of the swap. If it has * already been replaced, then that timestamp is later than the earlier * unacceptable one, and is thus acceptable. 
*/ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now; u32 cns, cur; ktime_get_coarse_real_ts64_mg(&now); now = timestamp_truncate(now, inode); /* Just return that if this is not a multigrain fs */ if (!is_mgtime(inode)) { inode_set_ctime_to_ts(inode, now); goto out; } /* * A fine-grained time is only needed if someone has queried * for timestamps, and the current coarse grained time isn't * later than what's already there. */ cns = smp_load_acquire(&inode->i_ctime_nsec); if (cns & I_CTIME_QUERIED) { struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec, .tv_nsec = cns & ~I_CTIME_QUERIED }; if (timespec64_compare(&now, &ctime) <= 0) { ktime_get_real_ts64_mg(&now); now = timestamp_truncate(now, inode); mgtime_counter_inc(mg_fine_stamps); } } mgtime_counter_inc(mg_ctime_updates); /* No need to cmpxchg if it's exactly the same */ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) { trace_ctime_xchg_skip(inode, &now); goto out; } cur = cns; retry: /* Try to swap the nsec value into place. */ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now.tv_sec; trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur); mgtime_counter_inc(mg_ctime_swaps); } else { /* * Was the change due to someone marking the old ctime QUERIED? * If so then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { cns = cur; goto retry; } /* Otherwise, keep the existing ctime */ now.tv_sec = inode->i_ctime_sec; now.tv_nsec = cur & ~I_CTIME_QUERIED; } out: return now; } EXPORT_SYMBOL(inode_set_ctime_current); /** * inode_set_ctime_deleg - try to update the ctime on a delegated inode * @inode: inode to update * @update: timespec64 to set the ctime * * Attempt to atomically update the ctime on behalf of a delegation holder. * * The nfs server can call back the holder of a delegation to get updated * inode attributes, including the mtime. When updating the mtime, update * the ctime to a value at least equal to that. * * This can race with concurrent updates to the inode, in which * case the update is skipped. * * Note that this works even when multigrain timestamps are not enabled, * so it is used in either case. */ struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update) { struct timespec64 now, cur_ts; u32 cur, old; /* pairs with try_cmpxchg below */ cur = smp_load_acquire(&inode->i_ctime_nsec); cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; cur_ts.tv_sec = inode->i_ctime_sec; /* If the update is older than the existing value, skip it. */ if (timespec64_compare(&update, &cur_ts) <= 0) return cur_ts; ktime_get_coarse_real_ts64_mg(&now); /* Clamp the update to "now" if it's in the future */ if (timespec64_compare(&update, &now) > 0) update = now; update = timestamp_truncate(update, inode); /* No need to update if the values are already the same */ if (timespec64_equal(&update, &cur_ts)) return cur_ts; /* * Try to swap the nsec value into place. If it fails, that means * it raced with an update due to a write or similar activity. That * stamp takes precedence, so just skip the update. */ retry: old = cur; if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) { inode->i_ctime_sec = update.tv_sec; mgtime_counter_inc(mg_ctime_swaps); return update; } /* * Was the change due to another task marking the old ctime QUERIED? 
* * If so, then retry the swap. This can only happen once since * the only way to clear I_CTIME_QUERIED is to stamp the inode * with a new ctime. */ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED))) goto retry; /* Otherwise, it was a new timestamp. */ cur_ts.tv_sec = inode->i_ctime_sec; cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED; return cur_ts; } EXPORT_SYMBOL(inode_set_ctime_deleg); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. * * Return: true if the caller is sufficiently privileged, false if not. */ bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid) { if (vfsgid_in_group_p(vfsgid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) return true; return false; } EXPORT_SYMBOL(in_group_or_capable); /** * mode_strip_sgid - handle the sgid bit for non-directories * @idmap: idmap of the mount the inode was created from * @dir: parent directory inode * @mode: mode of the file to be created in @dir * * If the @mode of the new file has both the S_ISGID and S_IXGRP bit * raised and @dir has the S_ISGID bit raised ensure that the caller is * either in the group of the parent directory or they have CAP_FSETID * in their user namespace and are privileged over the parent directory. * In all other cases, strip the S_ISGID bit from @mode. * * Return: the new mode to use for the file */ umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode) { if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) return mode; if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) return mode; if (in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir))) return mode; return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid);
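/*
 * The multigrain ctime helpers above (current_time(), inode_set_ctime_current(),
 * inode_set_ctime_deleg()) all treat the nanoseconds field as a flag word whose
 * top bit (I_CTIME_QUERIED) records that the stamp has been observed, and they
 * install updates with a compare-exchange that tolerates exactly one race with
 * a reader setting that bit. The snippet below is a minimal standalone
 * userspace model of that protocol, not kernel code: ctime_ns, CTIME_QUERIED,
 * query_ctime() and stamp_ctime() are invented names used only for illustration.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define CTIME_QUERIED	(1u << 31)	/* top bit: "has been read" */

static _Atomic uint32_t ctime_ns;	/* nsec field plus QUERIED bit */

/* Reader: fetch the nsec value and mark it as queried. */
static uint32_t query_ctime(void)
{
	return atomic_fetch_or(&ctime_ns, CTIME_QUERIED) & ~CTIME_QUERIED;
}

/* Writer: install a new nsec value, implicitly clearing the QUERIED bit. */
static void stamp_ctime(uint32_t new_ns)
{
	uint32_t cur = atomic_load(&ctime_ns);

	for (;;) {
		uint32_t old = cur;

		if (atomic_compare_exchange_strong(&ctime_ns, &cur, new_ns))
			return;		/* swap done */
		/*
		 * Retry only if the sole change was a reader setting the
		 * QUERIED bit; any other change means a newer stamp won.
		 */
		if (!(old & CTIME_QUERIED) && cur == (old | CTIME_QUERIED))
			continue;
		return;
	}
}

int main(void)
{
	stamp_ctime(100);
	printf("first stamp:  %u\n", query_ctime());
	stamp_ctime(200);
	printf("second stamp: %u\n", query_ctime());
	return 0;
}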
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "bio.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"

#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
				   sizeof(struct btrfs_item) * 2) / \
				   size) - 1))

#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
				       PAGE_SIZE))

/*
 * Set inode's size according to filesystem options.
 *
 * @inode:      inode we want to update the disk_i_size for
 * @new_i_size: i_size we want to set to, 0 if we use i_size
 *
 * With NO_HOLES set this simply sets the disk_i_size to whatever i_size_read()
 * returns as it is perfectly fine with a file that has holes without hole file
 * extent items.
 *
 * However without NO_HOLES we need to only return the area that is contiguous
 * from the 0 offset of the file. Otherwise we could end up adjusting i_size up
 * to an extent that has a gap in between.
 *
 * Finally new_i_size should only be set in the case of truncate where we're not
 * ready to use i_size_read() as the limiter yet.
 */
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
	u64 start, end, i_size;
	int ret;

	spin_lock(&inode->lock);
	i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
	if (!inode->file_extent_tree) {
		inode->disk_i_size = i_size;
		goto out_unlock;
	}

	ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
					 &end, EXTENT_DIRTY);
	if (!ret && start == 0)
		i_size = min(i_size, end + 1);
	else
		i_size = 0;
	inode->disk_i_size = i_size;
out_unlock:
	spin_unlock(&inode->lock);
}

/*
 * Mark range within a file as having a new extent inserted.
 *
 * @inode: inode being modified
 * @start: start file offset of the file extent we've inserted
 * @len:   logical length of the file extent item
 *
 * Call when we are inserting a new file extent where there was none before.
 * There is no need to call this in the case where we're replacing an existing
 * file extent, however if not sure it's fine to call this multiple times.
 *
 * The start and len must match the file extent item, and thus must be
 * sectorsize aligned.
 */
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
				      u64 len)
{
	if (!inode->file_extent_tree)
		return 0;

	if (len == 0)
		return 0;

	ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));

	return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
			      EXTENT_DIRTY, NULL);
}

/*
 * Mark an inode range as not having a backing extent.
 *
 * @inode: inode being modified
 * @start: start file offset of the file extent we've inserted
 * @len:   logical length of the file extent item
 *
 * Called when we drop a file extent, for example when we truncate. Doesn't
 * need to be called for cases where we're replacing a file extent, like when
 * we've COWed a file extent.
 *
 * The start and len must match the file extent item, and thus must be
 * sectorsize aligned.
 */
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
					u64 len)
{
	if (!inode->file_extent_tree)
		return 0;

	if (len == 0)
		return 0;

	ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
	       len == (u64)-1);

	return clear_extent_bit(inode->file_extent_tree, start,
				start + len - 1, EXTENT_DIRTY, NULL);
}

static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
{
	ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize));

	return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size;
}

static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info,
				 u32 csum_size)
{
	ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size));

	return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits;
}

static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
{
	u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum),
				       fs_info->csum_size);

	return csum_size_to_bytes(fs_info, max_csum_size);
}

/*
 * Calculate the total size needed to allocate for an ordered sum structure
 * spanning @bytes in the file.
*/ static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes) { int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; struct btrfs_path *path; struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; file_key.objectid = objectid; file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) goto out; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_disk_bytenr(leaf, item, 0); btrfs_set_file_extent_disk_num_bytes(leaf, item, 0); btrfs_set_file_extent_offset(leaf, item, 0); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); out: btrfs_free_path(path); return ret; } static struct btrfs_csum_item * btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 bytenr, int cow) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_key file_key; struct btrfs_key found_key; struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; const u32 csum_size = fs_info->csum_size; int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; leaf = path->nodes[0]; if (ret > 0) { ret = 1; if (path->slots[0] == 0) goto fail; path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.type != BTRFS_EXTENT_CSUM_KEY) goto fail; csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; csums_in_item = btrfs_item_size(leaf, path->slots[0]); csums_in_item /= csum_size; if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; } else if (csum_offset > csums_in_item) { goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); return item; fail: if (ret > 0) ret = -ENOENT; return ERR_PTR(ret); } int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 offset, int mod) { struct btrfs_key file_key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; file_key.objectid = objectid; file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } /* * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and * store the result to @dst. * * Return >0 for the number of sectors we found. * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum * for it. Caller may want to try next sector until one range is hit. * Return <0 for fatal error. 
*/ static int search_csum_tree(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 disk_bytenr, u64 len, u8 *dst) { struct btrfs_root *csum_root; struct btrfs_csum_item *item = NULL; struct btrfs_key key; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 itemsize; int ret; u64 csum_start; u64 csum_len; ASSERT(IS_ALIGNED(disk_bytenr, sectorsize) && IS_ALIGNED(len, sectorsize)); /* Check if the current csum item covers disk_bytenr */ if (path->nodes[0]) { item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; if (in_range(disk_bytenr, csum_start, csum_len)) goto found; } /* Current item doesn't contain the desired range, search again */ btrfs_release_path(path); csum_root = btrfs_csum_root(fs_info, disk_bytenr); item = btrfs_lookup_csum(NULL, csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { ret = PTR_ERR(item); goto out; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); itemsize = btrfs_item_size(path->nodes[0], path->slots[0]); csum_start = key.offset; csum_len = (itemsize / csum_size) * sectorsize; ASSERT(in_range(disk_bytenr, csum_start, csum_len)); found: ret = (min(csum_start + csum_len, disk_bytenr + len) - disk_bytenr) >> fs_info->sectorsize_bits; read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } /* * Lookup the checksum for the read bio in csum tree. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; blk_status_t ret = BLK_STS_OK; u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) return BLK_STS_OK; /* * This function is only called for read bio. * * This means two things: * - All our csums should only be in csum tree * No ordered extents csums, as ordered extents are only for write * path. * - No need to bother any other info from bvec * Since we're looking up csums, the only important info is the * disk_bytenr and the length, which can be extracted from bi_iter * directly. */ ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) return BLK_STS_RESOURCE; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); if (!bbio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } } else { bbio->csum = bbio->csum_inline; } /* * If requested number of sectors is larger than one leaf can contain, * kick the readahead for csum tree. */ if (nblocks > fs_info->csums_per_leaf) path->reada = READA_FORWARD; /* * the free space stuff is only read when it hasn't been * updated in the current transaction. So, we can safely * read from the commit root and sidestep a nasty deadlock * between reading the free space cache and updating the csum tree. 
*/ if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } while (bio_offset < orig_len) { int count; u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; u8 *csum_dst = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * csum_size; count = search_csum_tree(fs_info, path, cur_disk_bytenr, orig_len - bio_offset, csum_dst); if (count < 0) { ret = errno_to_blk_status(count); if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); bbio->csum = NULL; break; } /* * We didn't find a csum for this range. We need to make sure * we complain loudly about this, because we are not NODATASUM. * * However for the DATA_RELOC inode we could potentially be * relocating data extents for a NODATASUM inode, so the inode * itself won't be marked with NODATASUM, but the extent we're * copying is in fact NODATASUM. If we don't find a csum we * assume this is the case. */ if (count == 0) { memset(csum_dst, 0, csum_size); count = 1; if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; set_extent_bit(&inode->io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", cur_disk_bytenr, cur_disk_bytenr + sectorsize); } } bio_offset += count * sectorsize; } btrfs_free_path(path); return ret; } /* * Search for checksums for a given logical range. * * @root: The root where to look for checksums. * @start: Logical address of target checksum range. * @end: End offset (inclusive) of the target checksum range. * @list: List for adding each checksum that was found. * Can be NULL in case the caller only wants to check if * there any checksums for the range. * @nowait: Indicate if the search must be non-blocking or not. * * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were * found. */ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; int ret; bool found_csums = false; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); /* * There are two cases we can hit here for the previous csum * item: * * |<- search range ->| * |<- csum item ->| * * Or * |<- search range ->| * |<- csum item ->| * * Check if the previous csum item covers the leading part of * the search range. If so we have to start from previous csum * item. 
*/ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } while (start <= end) { u64 csum_end; leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset > end) break; if (key.offset > start) start = key.offset; csum_end = key.offset + csum_size_to_bytes(fs_info, btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; } found_csums = true; if (!list) goto out; csum_end = min(csum_end, end + 1); item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { unsigned long offset; size_t size; size = min_t(size_t, csum_end - start, max_ordered_sum_bytes(fs_info)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { ret = -ENOMEM; goto out; } sums->logical = start; sums->len = size; offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, bytes_to_csum_size(fs_info, size)); start += size; list_add_tail(&sums->list, list); } path->slots[0]++; } out: btrfs_free_path(path); if (ret < 0) { if (list) { struct btrfs_ordered_sum *tmp_sums; list_for_each_entry_safe(sums, tmp_sums, list, list) kfree(sums); } return ret; } return found_csums ? 1 : 0; } /* * Do the same work as btrfs_lookup_csums_list(), the difference is in how * we return the result. * * This version will set the corresponding bits in @csum_bitmap to represent * that there is a csum found. * Each bit represents a sector. Thus caller should ensure @csum_buf passed * in is large enough to contain all csums. */ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, u64 start, u64 end, u8 *csum_buf, unsigned long *csum_bitmap) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_csum_item *item; const u64 orig_start = start; bool free_path = false; int ret; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); if (!path) { path = btrfs_alloc_path(); if (!path) return -ENOMEM; free_path = true; } /* Check if we can reuse the previous path. */ if (path->nodes[0]) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY && key.offset <= start) goto search_forward; btrfs_release_path(path); } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto fail; if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); /* * There are two cases we can hit here for the previous csum * item: * * |<- search range ->| * |<- csum item ->| * * Or * |<- search range ->| * |<- csum item ->| * * Check if the previous csum item covers the leading part of * the search range. If so we have to start from previous csum * item. 
*/ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } search_forward: while (start <= end) { u64 csum_end; leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto fail; if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || key.type != BTRFS_EXTENT_CSUM_KEY || key.offset > end) break; if (key.offset > start) start = key.offset; csum_end = key.offset + csum_size_to_bytes(fs_info, btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; } csum_end = min(csum_end, end + 1); item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { unsigned long offset; size_t size; u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, start - orig_start); size = min_t(size_t, csum_end - start, end + 1 - start); offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], csum_dest, ((unsigned long)item) + offset, bytes_to_csum_size(fs_info, size)); bitmap_set(csum_bitmap, (start - orig_start) >> fs_info->sectorsize_bits, size >> fs_info->sectorsize_bits); start += size; } path->slots[0]++; } ret = 0; fail: if (free_path) btrfs_free_path(path); return ret; } /* * Calculate checksums of the data contained inside a bio. */ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; struct btrfs_ordered_sum *sums; char *data; struct bvec_iter iter; struct bio_vec bvec; int index; unsigned int blockcount; int i; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!sums) return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < blockcount; i++) { data = bvec_kmap_local(&bvec); crypto_shash_digest(shash, data + (i * fs_info->sectorsize), fs_info->sectorsize, sums->sums + index); kunmap_local(data); index += fs_info->csum_size; } } bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); return 0; } /* * Nodatasum I/O on zoned file systems still requires an btrfs_ordered_sum to * record the updated logical address on Zone Append completion. * Allocate just the structure with an empty sums array here for that case. */ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) { bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); if (!bbio->sums) return BLK_STS_RESOURCE; bbio->sums->len = bbio->bio.bi_iter.bi_size; bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; btrfs_add_ordered_sum(bbio->ordered, bbio->sums); return 0; } /* * Remove one checksum overlapping a range. 
* * This expects the key to describe the csum pointed to by the path, and it * expects the csum to overlap the range [bytenr, len] * * The csum should not be entirely contained in the range and the range should * not be entirely contained in the csum. * * This calls btrfs_truncate_item with the correct args based on the overlap, * and fixes up the key as required. */ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; const u32 csum_size = fs_info->csum_size; u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = fs_info->sectorsize_bits; leaf = path->nodes[0]; csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key->offset; if (key->offset < bytenr && csum_end <= end_byte) { /* * [ bytenr - len ] * [ ] * [csum ] * A simple truncate off the end of the item */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; btrfs_truncate_item(trans, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* * [ bytenr - len ] * [ ] * [csum ] * we need to truncate from the beginning of the csum */ u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; btrfs_truncate_item(trans, path, new_size, 0); key->offset = end_byte; btrfs_set_item_key_safe(trans, path, key); } else { BUG(); } } /* * Delete the csum items from the csum tree for a given range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; struct extent_buffer *leaf; int ret = 0; const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); if (!path) return -ENOMEM; while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; } else if (ret < 0) { break; } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || key.type != BTRFS_EXTENT_CSUM_KEY) { break; } if (key.offset >= end_byte) break; csum_end = btrfs_item_size(leaf, path->slots[0]) / csum_size; csum_end <<= blocksize_bits; csum_end += key.offset; /* this csum ends before we start, we're done */ if (csum_end <= bytenr) break; /* delete the entire item, it is inside our range */ if (key.offset >= bytenr && csum_end <= end_byte) { int del_nr = 1; /* * Check how many csum items preceding this one in this * leaf correspond to our range and then delete them all * at once. 
*/ if (key.offset > bytenr && path->slots[0] > 0) { int slot = path->slots[0] - 1; while (slot >= 0) { struct btrfs_key pk; btrfs_item_key_to_cpu(leaf, &pk, slot); if (pk.offset < bytenr || pk.type != BTRFS_EXTENT_CSUM_KEY || pk.objectid != BTRFS_EXTENT_CSUM_OBJECTID) break; path->slots[0] = slot; del_nr++; key.offset = pk.offset; slot--; } } ret = btrfs_del_items(trans, root, path, path->slots[0], del_nr); if (ret) break; if (key.offset == bytenr) break; } else if (key.offset < bytenr && csum_end > end_byte) { unsigned long offset; unsigned long shift_len; unsigned long item_offset; /* * [ bytenr - len ] * [csum ] * * Our bytes are in the middle of the csum, * we need to split this item and insert a new one. * * But we can't drop the path because the * csum could change, get removed, extended etc. * * The trick here is the max size of a csum item leaves * enough room in the tree block for a single * item header. So, we split the item in place, * adding a new header pointing to the existing * bytes. Then we loop around again and we have * a nicely formed csum item that we can neatly * truncate. */ offset = (bytenr - key.offset) >> blocksize_bits; offset *= csum_size; shift_len = (len >> blocksize_bits) * csum_size; item_offset = btrfs_item_ptr_offset(leaf, path->slots[0]); memzero_extent_buffer(leaf, item_offset + offset, shift_len); key.offset = bytenr; /* * btrfs_split_item returns -EAGAIN when the * item changed size or key */ ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { btrfs_abort_transaction(trans, ret); break; } ret = 0; key.offset = end_byte - 1; } else { truncate_one_csum(trans, path, &key, bytenr, len); if (key.offset < bytenr) break; } btrfs_release_path(path); } btrfs_free_path(path); return ret; } static int find_next_csum_offset(struct btrfs_root *root, struct btrfs_path *path, u64 *next_offset) { const u32 nritems = btrfs_header_nritems(path->nodes[0]); struct btrfs_key found_key; int slot = path->slots[0] + 1; int ret; if (nritems == 0 || slot >= nritems) { ret = btrfs_next_leaf(root, path); if (ret < 0) { return ret; } else if (ret > 0) { *next_offset = (u64)-1; return 0; } slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || found_key.type != BTRFS_EXTENT_CSUM_KEY) *next_offset = (u64)-1; else *next_offset = found_key.offset; return 0; } int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; struct btrfs_key found_key; struct btrfs_path *path; struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; u64 next_offset; u64 total_bytes = 0; u64 csum_offset; u64 bytenr; u32 ins_size; int index = 0; int found_next; int ret; const u32 csum_size = fs_info->csum_size; path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: next_offset = (u64)-1; found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { ret = 0; leaf = path->nodes[0]; item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((char *)item_end + btrfs_item_size(leaf, path->slots[0])); goto found; } ret = PTR_ERR(item); if (ret != -EFBIG && ret != -ENOENT) goto out; if (ret == 
-EFBIG) { u32 item_size; /* we found one, but it isn't big enough yet */ leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if ((item_size / csum_size) >= MAX_CSUM_ITEMS(fs_info, csum_size)) { /* already at max size, make a new one */ goto insert; } } else { /* We didn't find a csum item, insert one. */ ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) goto out; found_next = 1; goto insert; } /* * At this point, we know the tree has a checksum item that ends at an * offset matching the start of the checksum range we want to insert. * We try to extend that item as much as possible and then add as many * checksums to it as they fit. * * First check if the leaf has enough free space for at least one * checksum. If it has go directly to the item extension code, otherwise * release the path and do a search for insertion before the extension. */ if (btrfs_leaf_free_space(leaf) >= csum_size) { btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; goto extend_csum; } btrfs_release_path(path); path->search_for_extension = 1; ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); path->search_for_extension = 0; if (ret < 0) goto out; if (ret > 0) { if (path->slots[0] == 0) goto insert; path->slots[0]--; } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); csum_offset = (bytenr - found_key.offset) >> fs_info->sectorsize_bits; if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) { goto insert; } extend_csum: if (csum_offset == btrfs_item_size(leaf, path->slots[0]) / csum_size) { int extend_nr; u64 tmp; u32 diff; tmp = sums->len - total_bytes; tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); extend_nr = max_t(int, 1, tmp); /* * A log tree can already have checksum items with a subset of * the checksums we are trying to log. This can happen after * doing a sequence of partial writes into prealloc extents and * fsyncs in between, with a full fsync logging a larger subrange * of an extent for which a previous fast fsync logged a smaller * subrange. And this happens in particular due to merging file * extent items when we complete an ordered extent for a range * covered by a prealloc extent - this is done at * btrfs_mark_extent_written(). * * So if we try to extend the previous checksum item, which has * a range that ends at the start of the range we want to insert, * make sure we don't extend beyond the start offset of the next * checksum item. If we are at the last item in the leaf, then * forget the optimization of extending and add a new checksum * item - it is not worth the complexity of releasing the path, * getting the first key for the next leaf, repeat the btree * search, etc, because log trees are temporary anyway and it * would only save a few bytes of leaf space. 
*/ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { if (path->slots[0] + 1 >= btrfs_header_nritems(path->nodes[0])) { ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) goto out; found_next = 1; goto insert; } ret = find_next_csum_offset(root, path, &next_offset); if (ret < 0) goto out; tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; if (tmp <= INT_MAX) extend_nr = min_t(int, extend_nr, tmp); } diff = (csum_offset + extend_nr) * csum_size; diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff = diff - btrfs_item_size(leaf, path->slots[0]); diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); diff /= csum_size; diff *= csum_size; btrfs_extend_item(trans, path, diff); ret = 0; goto csum; } insert: btrfs_release_path(path); csum_offset = 0; if (found_next) { u64 tmp; tmp = sums->len - total_bytes; tmp >>= fs_info->sectorsize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> fs_info->sectorsize_bits); tmp = max_t(u64, 1, tmp); tmp = min_t(u64, tmp, MAX_CSUM_ITEMS(fs_info, csum_size)); ins_size = csum_size * tmp; } else { ins_size = csum_size; } ret = btrfs_insert_empty_item(trans, root, path, &file_key, ins_size); if (ret < 0) goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); item_end = (struct btrfs_csum_item *)((unsigned char *)item + btrfs_item_size(leaf, path->slots[0])); item = (struct btrfs_csum_item *)((unsigned char *)item + csum_offset * csum_size); found: ins_size = (u32)(sums->len - total_bytes) >> fs_info->sectorsize_bits; ins_size *= csum_size; ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, ins_size); write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, ins_size); index += ins_size; ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); goto again; } out: btrfs_free_path(path); return ret; } void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_key key; u64 extent_start; u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { const u64 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); em->start = extent_start; em->len = btrfs_file_extent_end(path) - extent_start; if (disk_bytenr == 0) { em->disk_bytenr = EXTENT_MAP_HOLE; em->disk_num_bytes = 0; em->offset = 0; return; } em->disk_bytenr = disk_bytenr; em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { extent_map_set_compression(em, compress_type); } else { /* * Older kernels can create regular non-hole data * extents with ram_bytes smaller than disk_num_bytes. * Not a big deal, just always use disk_num_bytes * for ram_bytes. 
*/ em->ram_bytes = em->disk_num_bytes; if (type == BTRFS_FILE_EXTENT_PREALLOC) em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { /* Tree-checker has ensured this. */ ASSERT(extent_start == 0); em->disk_bytenr = EXTENT_MAP_INLINE; em->start = 0; em->len = fs_info->sectorsize; em->offset = 0; extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " "root %llu", type, btrfs_ino(inode), extent_start, btrfs_root_id(root)); } } /* * Returns the end offset (non inclusive) of the file extent item the given path * points to. If it points to an inline extent, the returned offset is rounded * up to the sector size. */ u64 btrfs_file_extent_end(const struct btrfs_path *path) { const struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 end; btrfs_item_key_to_cpu(leaf, &key, slot); ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) end = leaf->fs_info->sectorsize; else end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); return end; }
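The extension path above packs as many new checksums as possible into an existing item: it estimates how many checksums remain, optionally clamps that to the start of the next checksum item (log trees), then clamps the byte growth to the per-item maximum and to the leaf's free space, rounding down to whole checksums. Below is a minimal, self-contained userspace sketch of that sizing sequence; the constants (4 KiB sectors, 4-byte checksums, a made-up per-item byte cap) and the helper name are hypothetical stand-ins, not the values btrfs derives at mount time.

/*
 * Sketch of the csum item growth calculation, assuming csum_offset * CSUM_SIZE
 * equals the current item size, as it does in the extension path above.
 */
#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE_BITS 12U             /* assume 4 KiB sectors */
#define CSUM_SIZE       4U              /* assume CRC32C checksums */
#define MAX_ITEM_BYTES  (16U * 1024U)   /* hypothetical per-item cap */

static uint32_t csum_extend_bytes(uint32_t item_size, uint64_t bytes_left,
                                  uint32_t leaf_free, uint32_t csum_offset)
{
        uint32_t extend_nr = (uint32_t)(bytes_left >> SECTORSIZE_BITS);
        uint32_t diff;

        if (extend_nr == 0)
                extend_nr = 1;

        diff = (csum_offset + extend_nr) * CSUM_SIZE;
        if (diff > MAX_ITEM_BYTES)
                diff = MAX_ITEM_BYTES;  /* never exceed the item size cap */
        diff -= item_size;              /* grow only beyond the current size */
        if (diff > leaf_free)
                diff = leaf_free;       /* never grow past the leaf's free space */
        diff -= diff % CSUM_SIZE;       /* whole checksums only */

        return diff;
}

int main(void)
{
        /* 64 KiB of data left; the item currently holds 32 checksums. */
        printf("grow by %u bytes\n",
               (unsigned int)csum_extend_bytes(32 * CSUM_SIZE, 64 * 1024,
                                               2000, 32));
        return 0;
}

Built with a standard C compiler this prints "grow by 64 bytes": the 16 remaining checksums of 4 bytes each fit within both caps.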
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/power/common.c - Common device power management code.
 *
 * Copyright (C) 2011 Rafael J. Wysocki <rjw@sisk.pl>, Renesas Electronics Corp.
 */
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pm_clock.h>
#include <linux/acpi.h>
#include <linux/pm_domain.h>
#include <linux/pm_opp.h>

#include "power.h"

/**
 * dev_pm_get_subsys_data - Create or refcount power.subsys_data for device.
 * @dev: Device to handle.
 *
 * If power.subsys_data is NULL, point it to a new object, otherwise increment
 * its reference counter. Return 0 if new object has been created or refcount
 * increased, otherwise negative error code.
 */
int dev_pm_get_subsys_data(struct device *dev)
{
        struct pm_subsys_data *psd;

        psd = kzalloc(sizeof(*psd), GFP_KERNEL);
        if (!psd)
                return -ENOMEM;

        spin_lock_irq(&dev->power.lock);

        if (dev->power.subsys_data) {
                dev->power.subsys_data->refcount++;
        } else {
                spin_lock_init(&psd->lock);
                psd->refcount = 1;
                dev->power.subsys_data = psd;
                pm_clk_init(dev);
                psd = NULL;
        }

        spin_unlock_irq(&dev->power.lock);

        /* kfree() verifies that its argument is nonzero. */
        kfree(psd);

        return 0;
}
EXPORT_SYMBOL_GPL(dev_pm_get_subsys_data);

/**
 * dev_pm_put_subsys_data - Drop reference to power.subsys_data.
 * @dev: Device to handle.
 *
 * If the reference counter of power.subsys_data is zero after dropping the
 * reference, power.subsys_data is removed.
*/ void dev_pm_put_subsys_data(struct device *dev) { struct pm_subsys_data *psd; spin_lock_irq(&dev->power.lock); psd = dev_to_psd(dev); if (!psd) goto out; if (--psd->refcount == 0) dev->power.subsys_data = NULL; else psd = NULL; out: spin_unlock_irq(&dev->power.lock); kfree(psd); } EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); /** * dev_pm_domain_attach - Attach a device to its PM domain. * @dev: Device to attach. * @power_on: Used to indicate whether we should power on the device. * * The @dev may only be attached to a single PM domain. By iterating through * the available alternatives we try to find a valid PM domain for the device. * As attachment succeeds, the ->detach() callback in the struct dev_pm_domain * should be assigned by the corresponding attach function. * * This function should typically be invoked from subsystem level code during * the probe phase. Especially for those that holds devices which requires * power management through PM domains. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns 0 on successfully attached PM domain, or when it is found that the * device doesn't need a PM domain, else a negative error code. */ int dev_pm_domain_attach(struct device *dev, bool power_on) { int ret; if (dev->pm_domain) return 0; ret = acpi_dev_pm_attach(dev, power_on); if (!ret) ret = genpd_dev_pm_attach(dev); return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach); /** * dev_pm_domain_attach_by_id - Associate a device with one of its PM domains. * @dev: The device used to lookup the PM domain. * @index: The index of the PM domain. * * As @dev may only be attached to a single PM domain, the backend PM domain * provider creates a virtual device to attach instead. If attachment succeeds, * the ->detach() callback in the struct dev_pm_domain are assigned by the * corresponding backend attach function, as to deal with detaching of the * created virtual device. * * This function should typically be invoked by a driver during the probe phase, * in case its device requires power management through multiple PM domains. The * driver may benefit from using the received device, to configure device-links * towards its original device. Depending on the use-case and if needed, the * links may be dynamically changed by the driver, which allows it to control * the power to the PM domains independently from each other. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns the virtual created device when successfully attached to its PM * domain, NULL in case @dev don't need a PM domain, else an ERR_PTR(). * Note that, to detach the returned virtual device, the driver shall call * dev_pm_domain_detach() on it, typically during the remove phase. */ struct device *dev_pm_domain_attach_by_id(struct device *dev, unsigned int index) { if (dev->pm_domain) return ERR_PTR(-EEXIST); return genpd_dev_pm_attach_by_id(dev, index); } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_id); /** * dev_pm_domain_attach_by_name - Associate a device with one of its PM domains. * @dev: The device used to lookup the PM domain. * @name: The name of the PM domain. * * For a detailed function description, see dev_pm_domain_attach_by_id(). 
*/ struct device *dev_pm_domain_attach_by_name(struct device *dev, const char *name) { if (dev->pm_domain) return ERR_PTR(-EEXIST); return genpd_dev_pm_attach_by_name(dev, name); } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_by_name); /** * dev_pm_domain_attach_list - Associate a device with its PM domains. * @dev: The device used to lookup the PM domains for. * @data: The data used for attaching to the PM domains. * @list: An out-parameter with an allocated list of attached PM domains. * * This function helps to attach a device to its multiple PM domains. The * caller, which is typically a driver's probe function, may provide a list of * names for the PM domains that we should try to attach the device to, but it * may also provide an empty list, in case the attach should be done for all of * the available PM domains. * * Callers must ensure proper synchronization of this function with power * management callbacks. * * Returns the number of attached PM domains or a negative error code in case of * a failure. Note that, to detach the list of PM domains, the driver shall call * dev_pm_domain_detach_list(), typically during the remove phase. */ int dev_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list) { struct device_node *np = dev->of_node; struct dev_pm_domain_list *pds; struct device *pd_dev = NULL; int ret, i, num_pds = 0; bool by_id = true; size_t size; u32 pd_flags = data ? data->pd_flags : 0; u32 link_flags = pd_flags & PD_FLAG_NO_DEV_LINK ? 0 : DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME; if (dev->pm_domain) return -EEXIST; /* For now this is limited to OF based platforms. */ if (!np) return 0; if (data && data->pd_names) { num_pds = data->num_pd_names; by_id = false; } else { num_pds = of_count_phandle_with_args(np, "power-domains", "#power-domain-cells"); } if (num_pds <= 0) return 0; pds = kzalloc(sizeof(*pds), GFP_KERNEL); if (!pds) return -ENOMEM; size = sizeof(*pds->pd_devs) + sizeof(*pds->pd_links) + sizeof(*pds->opp_tokens); pds->pd_devs = kcalloc(num_pds, size, GFP_KERNEL); if (!pds->pd_devs) { ret = -ENOMEM; goto free_pds; } pds->pd_links = (void *)(pds->pd_devs + num_pds); pds->opp_tokens = (void *)(pds->pd_links + num_pds); if (link_flags && pd_flags & PD_FLAG_DEV_LINK_ON) link_flags |= DL_FLAG_RPM_ACTIVE; for (i = 0; i < num_pds; i++) { if (by_id) pd_dev = dev_pm_domain_attach_by_id(dev, i); else pd_dev = dev_pm_domain_attach_by_name(dev, data->pd_names[i]); if (IS_ERR_OR_NULL(pd_dev)) { ret = pd_dev ? PTR_ERR(pd_dev) : -ENODEV; goto err_attach; } if (pd_flags & PD_FLAG_REQUIRED_OPP) { struct dev_pm_opp_config config = { .required_dev = pd_dev, .required_dev_index = i, }; ret = dev_pm_opp_set_config(dev, &config); if (ret < 0) goto err_link; pds->opp_tokens[i] = ret; } if (link_flags) { struct device_link *link; link = device_link_add(dev, pd_dev, link_flags); if (!link) { ret = -ENODEV; goto err_link; } pds->pd_links[i] = link; } pds->pd_devs[i] = pd_dev; } pds->num_pds = num_pds; *list = pds; return num_pds; err_link: dev_pm_opp_clear_config(pds->opp_tokens[i]); dev_pm_domain_detach(pd_dev, true); err_attach: while (--i >= 0) { dev_pm_opp_clear_config(pds->opp_tokens[i]); if (pds->pd_links[i]) device_link_del(pds->pd_links[i]); dev_pm_domain_detach(pds->pd_devs[i], true); } kfree(pds->pd_devs); free_pds: kfree(pds); return ret; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach_list); /** * devm_pm_domain_detach_list - devres-enabled version of dev_pm_domain_detach_list. * @_list: The list of PM domains to detach. 
* * This function reverse the actions from devm_pm_domain_attach_list(). * it will be invoked during the remove phase from drivers implicitly if driver * uses devm_pm_domain_attach_list() to attach the PM domains. */ static void devm_pm_domain_detach_list(void *_list) { struct dev_pm_domain_list *list = _list; dev_pm_domain_detach_list(list); } /** * devm_pm_domain_attach_list - devres-enabled version of dev_pm_domain_attach_list * @dev: The device used to lookup the PM domains for. * @data: The data used for attaching to the PM domains. * @list: An out-parameter with an allocated list of attached PM domains. * * NOTE: this will also handle calling devm_pm_domain_detach_list() for * you during remove phase. * * Returns the number of attached PM domains or a negative error code in case of * a failure. */ int devm_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list) { int ret, num_pds; num_pds = dev_pm_domain_attach_list(dev, data, list); if (num_pds <= 0) return num_pds; ret = devm_add_action_or_reset(dev, devm_pm_domain_detach_list, *list); if (ret) return ret; return num_pds; } EXPORT_SYMBOL_GPL(devm_pm_domain_attach_list); /** * dev_pm_domain_detach - Detach a device from its PM domain. * @dev: Device to detach. * @power_off: Used to indicate whether we should power off the device. * * This functions will reverse the actions from dev_pm_domain_attach(), * dev_pm_domain_attach_by_id() and dev_pm_domain_attach_by_name(), thus it * detaches @dev from its PM domain. Typically it should be invoked during the * remove phase, either from subsystem level code or from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. */ void dev_pm_domain_detach(struct device *dev, bool power_off) { if (dev->pm_domain && dev->pm_domain->detach) dev->pm_domain->detach(dev, power_off); } EXPORT_SYMBOL_GPL(dev_pm_domain_detach); /** * dev_pm_domain_detach_list - Detach a list of PM domains. * @list: The list of PM domains to detach. * * This function reverse the actions from dev_pm_domain_attach_list(). * Typically it should be invoked during the remove phase from drivers. * * Callers must ensure proper synchronization of this function with power * management callbacks. */ void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) { int i; if (!list) return; for (i = 0; i < list->num_pds; i++) { dev_pm_opp_clear_config(list->opp_tokens[i]); if (list->pd_links[i]) device_link_del(list->pd_links[i]); dev_pm_domain_detach(list->pd_devs[i], true); } kfree(list->pd_devs); kfree(list); } EXPORT_SYMBOL_GPL(dev_pm_domain_detach_list); /** * dev_pm_domain_start - Start the device through its PM domain. * @dev: Device to start. * * This function should typically be called during probe by a subsystem/driver, * when it needs to start its device from the PM domain's perspective. Note * that, it's assumed that the PM domain is already powered on when this * function is called. * * Returns 0 on success and negative error values on failures. */ int dev_pm_domain_start(struct device *dev) { if (dev->pm_domain && dev->pm_domain->start) return dev->pm_domain->start(dev); return 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_start); /** * dev_pm_domain_set - Set PM domain of a device. * @dev: Device whose PM domain is to be set. * @pd: PM domain to be set, or NULL. * * Sets the PM domain the device belongs to. The PM domain of a device needs * to be set before its probe finishes (it's bound to a driver). 
* * This function must be called with the device lock held. */ void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd) { if (dev->pm_domain == pd) return; WARN(pd && device_is_bound(dev), "PM domains can only be changed for unbound devices\n"); dev->pm_domain = pd; device_pm_check_callbacks(dev); } EXPORT_SYMBOL_GPL(dev_pm_domain_set); /** * dev_pm_domain_set_performance_state - Request a new performance state. * @dev: The device to make the request for. * @state: Target performance state for the device. * * This function should be called when a new performance state needs to be * requested for a device that is attached to a PM domain. Note that, the * support for performance scaling for PM domains is optional. * * Returns 0 on success and when performance scaling isn't supported, negative * error code on failure. */ int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state) { if (dev->pm_domain && dev->pm_domain->set_performance_state) return dev->pm_domain->set_performance_state(dev, state); return 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_set_performance_state);
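The attach helpers above are intended to be called from a driver's probe path, with detach happening on remove. The snippet below is a rough usage sketch, not taken from any real driver: the probe function name and the "perf"/"mx" domain names are hypothetical, while the APIs and flags used (devm_pm_domain_attach_list(), dev_pm_domain_set_performance_state(), PD_FLAG_DEV_LINK_ON) are the ones defined or referenced in this file.

/* Hypothetical consumer driver probe; names and domain list are illustrative. */
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm_domain.h>

static int foo_probe(struct platform_device *pdev)
{
        static const char * const pd_names[] = { "perf", "mx" };
        const struct dev_pm_domain_attach_data pd_data = {
                .pd_names = pd_names,
                .num_pd_names = ARRAY_SIZE(pd_names),
                .pd_flags = PD_FLAG_DEV_LINK_ON,
        };
        struct dev_pm_domain_list *pd_list = NULL;
        int ret;

        /*
         * Attach to both PM domains; the devres variant queues
         * dev_pm_domain_detach_list() for the remove/unbind path.
         */
        ret = devm_pm_domain_attach_list(&pdev->dev, &pd_data, &pd_list);
        if (ret < 0)
                return ret;

        /* Optionally request a performance level (no-op if unsupported). */
        return dev_pm_domain_set_performance_state(&pdev->dev, 1);
}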
/*
 * videobuf2-vmalloc.c - vmalloc memory allocator for videobuf2
 *
 * Copyright (C) 2010 Samsung Electronics
 *
 * Author: Pawel Osciak <pawel@osciak.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.
*/ #include <linux/io.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <media/videobuf2-v4l2.h> #include <media/videobuf2-vmalloc.h> #include <media/videobuf2-memops.h> struct vb2_vmalloc_buf { void *vaddr; struct frame_vector *vec; enum dma_data_direction dma_dir; unsigned long size; refcount_t refcount; struct vb2_vmarea_handler handler; struct dma_buf *dbuf; }; static void vb2_vmalloc_put(void *buf_priv); static void *vb2_vmalloc_alloc(struct vb2_buffer *vb, struct device *dev, unsigned long size) { struct vb2_vmalloc_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL | vb->vb2_queue->gfp_flags); if (!buf) return ERR_PTR(-ENOMEM); buf->size = size; buf->vaddr = vmalloc_user(buf->size); if (!buf->vaddr) { pr_debug("vmalloc of size %ld failed\n", buf->size); kfree(buf); return ERR_PTR(-ENOMEM); } buf->dma_dir = vb->vb2_queue->dma_dir; buf->handler.refcount = &buf->refcount; buf->handler.put = vb2_vmalloc_put; buf->handler.arg = buf; refcount_set(&buf->refcount, 1); return buf; } static void vb2_vmalloc_put(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; if (refcount_dec_and_test(&buf->refcount)) { vfree(buf->vaddr); kfree(buf); } } static void *vb2_vmalloc_get_userptr(struct vb2_buffer *vb, struct device *dev, unsigned long vaddr, unsigned long size) { struct vb2_vmalloc_buf *buf; struct frame_vector *vec; int n_pages, offset, i; int ret = -ENOMEM; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); buf->dma_dir = vb->vb2_queue->dma_dir; offset = vaddr & ~PAGE_MASK; buf->size = size; vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE || buf->dma_dir == DMA_BIDIRECTIONAL); if (IS_ERR(vec)) { ret = PTR_ERR(vec); goto fail_pfnvec_create; } buf->vec = vec; n_pages = frame_vector_count(vec); if (frame_vector_to_pages(vec) < 0) { unsigned long *nums = frame_vector_pfns(vec); /* * We cannot get page pointers for these pfns. Check memory is * physically contiguous and use direct mapping. 
*/ for (i = 1; i < n_pages; i++) if (nums[i-1] + 1 != nums[i]) goto fail_map; buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) goto fail_map; buf->vaddr += offset; return buf; fail_map: vb2_destroy_framevec(vec); fail_pfnvec_create: kfree(buf); return ERR_PTR(ret); } static void vb2_vmalloc_put_userptr(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; unsigned long vaddr = (unsigned long)buf->vaddr & PAGE_MASK; unsigned int i; struct page **pages; unsigned int n_pages; if (!buf->vec->is_pfns) { n_pages = frame_vector_count(buf->vec); if (vaddr) vm_unmap_ram((void *)vaddr, n_pages); if (buf->dma_dir == DMA_FROM_DEVICE || buf->dma_dir == DMA_BIDIRECTIONAL) { pages = frame_vector_pages(buf->vec); if (!WARN_ON_ONCE(IS_ERR(pages))) for (i = 0; i < n_pages; i++) set_page_dirty_lock(pages[i]); } } else { iounmap((__force void __iomem *)buf->vaddr); } vb2_destroy_framevec(buf->vec); kfree(buf); } static void *vb2_vmalloc_vaddr(struct vb2_buffer *vb, void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; if (!buf->vaddr) { pr_err("Address of an unallocated plane requested or cannot map user pointer\n"); return NULL; } return buf->vaddr; } static unsigned int vb2_vmalloc_num_users(void *buf_priv) { struct vb2_vmalloc_buf *buf = buf_priv; return refcount_read(&buf->refcount); } static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) { struct vb2_vmalloc_buf *buf = buf_priv; int ret; if (!buf) { pr_err("No memory to map\n"); return -EINVAL; } ret = remap_vmalloc_range(vma, buf->vaddr, 0); if (ret) { pr_err("Remapping vmalloc memory, error: %d\n", ret); return ret; } /* * Make sure that vm_areas for 2 buffers won't be merged together */ vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. 
*/ vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; vma->vm_ops->open(vma); return 0; } #ifdef CONFIG_HAS_DMA /*********************************************/ /* DMABUF ops for exporters */ /*********************************************/ struct vb2_vmalloc_attachment { struct sg_table sgt; enum dma_data_direction dma_dir; }; static int vb2_vmalloc_dmabuf_ops_attach(struct dma_buf *dbuf, struct dma_buf_attachment *dbuf_attach) { struct vb2_vmalloc_attachment *attach; struct vb2_vmalloc_buf *buf = dbuf->priv; int num_pages = PAGE_ALIGN(buf->size) / PAGE_SIZE; struct sg_table *sgt; struct scatterlist *sg; void *vaddr = buf->vaddr; int ret; int i; attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) return -ENOMEM; sgt = &attach->sgt; ret = sg_alloc_table(sgt, num_pages, GFP_KERNEL); if (ret) { kfree(attach); return ret; } for_each_sgtable_sg(sgt, sg, i) { struct page *page = vmalloc_to_page(vaddr); if (!page) { sg_free_table(sgt); kfree(attach); return -ENOMEM; } sg_set_page(sg, page, PAGE_SIZE, 0); vaddr += PAGE_SIZE; } attach->dma_dir = DMA_NONE; dbuf_attach->priv = attach; return 0; } static void vb2_vmalloc_dmabuf_ops_detach(struct dma_buf *dbuf, struct dma_buf_attachment *db_attach) { struct vb2_vmalloc_attachment *attach = db_attach->priv; struct sg_table *sgt; if (!attach) return; sgt = &attach->sgt; /* release the scatterlist cache */ if (attach->dma_dir != DMA_NONE) dma_unmap_sgtable(db_attach->dev, sgt, attach->dma_dir, 0); sg_free_table(sgt); kfree(attach); db_attach->priv = NULL; } static struct sg_table *vb2_vmalloc_dmabuf_ops_map( struct dma_buf_attachment *db_attach, enum dma_data_direction dma_dir) { struct vb2_vmalloc_attachment *attach = db_attach->priv; struct sg_table *sgt; sgt = &attach->sgt; /* return previously mapped sg table */ if (attach->dma_dir == dma_dir) return sgt; /* release any previous cache */ if (attach->dma_dir != DMA_NONE) { dma_unmap_sgtable(db_attach->dev, sgt, attach->dma_dir, 0); attach->dma_dir = DMA_NONE; } /* mapping to the client with new direction */ if (dma_map_sgtable(db_attach->dev, sgt, dma_dir, 0)) { pr_err("failed to map scatterlist\n"); return ERR_PTR(-EIO); } attach->dma_dir = dma_dir; return sgt; } static void vb2_vmalloc_dmabuf_ops_unmap(struct dma_buf_attachment *db_attach, struct sg_table *sgt, enum dma_data_direction dma_dir) { /* nothing to be done here */ } static void vb2_vmalloc_dmabuf_ops_release(struct dma_buf *dbuf) { /* drop reference obtained in vb2_vmalloc_get_dmabuf */ vb2_vmalloc_put(dbuf->priv); } static int vb2_vmalloc_dmabuf_ops_vmap(struct dma_buf *dbuf, struct iosys_map *map) { struct vb2_vmalloc_buf *buf = dbuf->priv; iosys_map_set_vaddr(map, buf->vaddr); return 0; } static int vb2_vmalloc_dmabuf_ops_mmap(struct dma_buf *dbuf, struct vm_area_struct *vma) { return vb2_vmalloc_mmap(dbuf->priv, vma); } static const struct dma_buf_ops vb2_vmalloc_dmabuf_ops = { .attach = vb2_vmalloc_dmabuf_ops_attach, .detach = vb2_vmalloc_dmabuf_ops_detach, .map_dma_buf = vb2_vmalloc_dmabuf_ops_map, .unmap_dma_buf = vb2_vmalloc_dmabuf_ops_unmap, .vmap = vb2_vmalloc_dmabuf_ops_vmap, .mmap = vb2_vmalloc_dmabuf_ops_mmap, .release = vb2_vmalloc_dmabuf_ops_release, }; static struct dma_buf *vb2_vmalloc_get_dmabuf(struct vb2_buffer *vb, void *buf_priv, unsigned long flags) { struct vb2_vmalloc_buf *buf = buf_priv; struct dma_buf *dbuf; DEFINE_DMA_BUF_EXPORT_INFO(exp_info); exp_info.ops = &vb2_vmalloc_dmabuf_ops; exp_info.size = buf->size; exp_info.flags = flags; exp_info.priv = buf; if (WARN_ON(!buf->vaddr)) 
return NULL; dbuf = dma_buf_export(&exp_info); if (IS_ERR(dbuf)) return NULL; /* dmabuf keeps reference to vb2 buffer */ refcount_inc(&buf->refcount); return dbuf; } #endif /* CONFIG_HAS_DMA */ /*********************************************/ /* callbacks for DMABUF buffers */ /*********************************************/ static int vb2_vmalloc_map_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map; int ret; ret = dma_buf_vmap_unlocked(buf->dbuf, &map); if (ret) return -EFAULT; buf->vaddr = map.vaddr; return 0; } static void vb2_vmalloc_unmap_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map = IOSYS_MAP_INIT_VADDR(buf->vaddr); dma_buf_vunmap_unlocked(buf->dbuf, &map); buf->vaddr = NULL; } static void vb2_vmalloc_detach_dmabuf(void *mem_priv) { struct vb2_vmalloc_buf *buf = mem_priv; struct iosys_map map = IOSYS_MAP_INIT_VADDR(buf->vaddr); if (buf->vaddr) dma_buf_vunmap_unlocked(buf->dbuf, &map); kfree(buf); } static void *vb2_vmalloc_attach_dmabuf(struct vb2_buffer *vb, struct device *dev, struct dma_buf *dbuf, unsigned long size) { struct vb2_vmalloc_buf *buf; if (dbuf->size < size) return ERR_PTR(-EFAULT); buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); buf->dbuf = dbuf; buf->dma_dir = vb->vb2_queue->dma_dir; buf->size = size; return buf; } const struct vb2_mem_ops vb2_vmalloc_memops = { .alloc = vb2_vmalloc_alloc, .put = vb2_vmalloc_put, .get_userptr = vb2_vmalloc_get_userptr, .put_userptr = vb2_vmalloc_put_userptr, #ifdef CONFIG_HAS_DMA .get_dmabuf = vb2_vmalloc_get_dmabuf, #endif .map_dmabuf = vb2_vmalloc_map_dmabuf, .unmap_dmabuf = vb2_vmalloc_unmap_dmabuf, .attach_dmabuf = vb2_vmalloc_attach_dmabuf, .detach_dmabuf = vb2_vmalloc_detach_dmabuf, .vaddr = vb2_vmalloc_vaddr, .mmap = vb2_vmalloc_mmap, .num_users = vb2_vmalloc_num_users, }; EXPORT_SYMBOL_GPL(vb2_vmalloc_memops); MODULE_DESCRIPTION("vmalloc memory handling routines for videobuf2"); MODULE_AUTHOR("Pawel Osciak <pawel@osciak.com>"); MODULE_LICENSE("GPL"); MODULE_IMPORT_NS("DMA_BUF");
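The vb2_vmalloc_buf lifecycle above is reference counted: the allocator holds the first reference, mmap handlers and dmabuf exports take additional ones, and the backing memory is released only when the last user drops its reference. The following is a loose userspace analogy of that pattern, with calloc() and a plain counter standing in for vmalloc_user() and refcount_t; it is an illustration of the lifecycle, not kernel code.

#include <stdio.h>
#include <stdlib.h>

struct buf {
        void *vaddr;
        unsigned long size;
        unsigned int refcount;
};

static struct buf *buf_alloc(unsigned long size)
{
        struct buf *b = calloc(1, sizeof(*b));

        if (!b)
                return NULL;
        b->vaddr = calloc(1, size);
        if (!b->vaddr) {
                free(b);
                return NULL;
        }
        b->size = size;
        b->refcount = 1;        /* creator holds the first reference */
        return b;
}

static void buf_get(struct buf *b)
{
        b->refcount++;          /* e.g. taken when the buffer is exported */
}

static void buf_put(struct buf *b)
{
        if (--b->refcount == 0) {       /* last user frees the backing memory */
                free(b->vaddr);
                free(b);
        }
}

int main(void)
{
        struct buf *b = buf_alloc(4096);

        if (!b)
                return 1;
        buf_get(b);     /* second user, e.g. an mmap() mapping */
        buf_put(b);     /* mapping goes away */
        buf_put(b);     /* creator drops its reference: memory is freed */
        printf("buffer released\n");
        return 0;
}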
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions implement the SCTP primitive functions from Section 10.
 *
 * Note that the descriptions from the specification are USER level
 * functions--this file is the functions which populate the struct proto
 * for SCTP which is the BOTTOM of the sockets interface.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 * lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 * La Monte H.P. Yarroll <piggy@acm.org>
 * Narasimha Budihal <narasimha@refcode.org>
 * Karl Knutson <karl@athena.chicago.il.us>
 * Ardelle Fan <ardelle.fan@intel.com>
 * Kevin Gao <kevin.gao@intel.com>
 */

#include <linux/types.h>
#include <linux/list.h> /* For struct list_head */
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/time.h> /* For struct timeval */
#include <linux/gfp.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

#define DECLARE_PRIMITIVE(name) \
/* This is called in the code as sctp_primitive_ ## name. */ \
int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \
                            void *arg) { \
        int error = 0; \
        enum sctp_event_type event_type; union sctp_subtype subtype; \
        enum sctp_state state; \
        struct sctp_endpoint *ep; \
        \
        event_type = SCTP_EVENT_T_PRIMITIVE; \
        subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \
        state = asoc ? asoc->state : SCTP_STATE_CLOSED; \
        ep = asoc ? asoc->ep : NULL; \
        \
        error = sctp_do_sm(net, event_type, subtype, state, ep, asoc, \
                           arg, GFP_KERNEL); \
        return error; \
}

/* 10.1 ULP-to-SCTP
 * B) Associate
 *
 * Format: ASSOCIATE(local SCTP instance name, destination transport addr,
 * outbound stream count)
 * -> association id [,destination transport addr list] [,outbound stream
 * count]
 *
 * This primitive allows the upper layer to initiate an association to a
 * specific peer endpoint.
 *
 * This version assumes that asoc is fully populated with the initial
 * parameters. We then return a traditional kernel indicator of
 * success or failure.
 */

/* This is called in the code as sctp_primitive_ASSOCIATE. */
DECLARE_PRIMITIVE(ASSOCIATE)

/* 10.1 ULP-to-SCTP
 * C) Shutdown
 *
 * Format: SHUTDOWN(association id)
 * -> result
 *
 * Gracefully closes an association. Any locally queued user data
 * will be delivered to the peer. The association will be terminated only
 * after the peer acknowledges all the SCTP packets sent. A success code
 * will be returned on successful termination of the association. If
 * attempting to terminate the association results in a failure, an error
 * code shall be returned.
*/ DECLARE_PRIMITIVE(SHUTDOWN); /* 10.1 ULP-to-SCTP * C) Abort * * Format: Abort(association id [, cause code]) * -> result * * Ungracefully closes an association. Any locally queued user data * will be discarded and an ABORT chunk is sent to the peer. A success * code will be returned on successful abortion of the association. If * attempting to abort the association results in a failure, an error * code shall be returned. */ DECLARE_PRIMITIVE(ABORT); /* 10.1 ULP-to-SCTP * E) Send * * Format: SEND(association id, buffer address, byte count [,context] * [,stream id] [,life time] [,destination transport address] * [,unorder flag] [,no-bundle flag] [,payload protocol-id] ) * -> result * * This is the main method to send user data via SCTP. * * Mandatory attributes: * * o association id - local handle to the SCTP association * * o buffer address - the location where the user message to be * transmitted is stored; * * o byte count - The size of the user data in number of bytes; * * Optional attributes: * * o context - an optional 32 bit integer that will be carried in the * sending failure notification to the ULP if the transportation of * this User Message fails. * * o stream id - to indicate which stream to send the data on. If not * specified, stream 0 will be used. * * o life time - specifies the life time of the user data. The user data * will not be sent by SCTP after the life time expires. This * parameter can be used to avoid efforts to transmit stale * user messages. SCTP notifies the ULP if the data cannot be * initiated to transport (i.e. sent to the destination via SCTP's * send primitive) within the life time variable. However, the * user data will be transmitted if SCTP has attempted to transmit a * chunk before the life time expired. * * o destination transport address - specified as one of the destination * transport addresses of the peer endpoint to which this packet * should be sent. Whenever possible, SCTP should use this destination * transport address for sending the packets, instead of the current * primary path. * * o unorder flag - this flag, if present, indicates that the user * would like the data delivered in an unordered fashion to the peer * (i.e., the U flag is set to 1 on all DATA chunks carrying this * message). * * o no-bundle flag - instructs SCTP not to bundle this user data with * other outbound DATA chunks. SCTP MAY still bundle even when * this flag is present, when faced with network congestion. * * o payload protocol-id - A 32 bit unsigned integer that is to be * passed to the peer indicating the type of payload protocol data * being transmitted. This value is passed as opaque data by SCTP. */ DECLARE_PRIMITIVE(SEND); /* 10.1 ULP-to-SCTP * J) Request Heartbeat * * Format: REQUESTHEARTBEAT(association id, destination transport address) * * -> result * * Instructs the local endpoint to perform a HeartBeat on the specified * destination transport address of the given association. The returned * result should indicate whether the transmission of the HEARTBEAT * chunk to the destination address is successful. * * Mandatory attributes: * * o association id - local handle to the SCTP association * * o destination transport address - the transport address of the * association on which a heartbeat should be issued. */ DECLARE_PRIMITIVE(REQUESTHEARTBEAT); /* ADDIP * 3.1.1 Address Configuration Change Chunk (ASCONF) * * This chunk is used to communicate to the remote endpoint one of the * configuration change requests that MUST be acknowledged. 
The * information carried in the ASCONF Chunk uses the form of a * Type-Length-Value (TLV), as described in "3.2.1 Optional/ * Variable-length Parameter Format" in RFC2960 [5], for all variable * parameters. */ DECLARE_PRIMITIVE(ASCONF); /* RE-CONFIG 5.1 */ DECLARE_PRIMITIVE(RECONF);
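For reference, this is roughly what DECLARE_PRIMITIVE(SEND) expands to, written out by hand from the macro body above (modulo whitespace); it is not preprocessor output.

/* Approximate expansion of DECLARE_PRIMITIVE(SEND). */
int sctp_primitive_SEND(struct net *net, struct sctp_association *asoc,
                        void *arg)
{
        int error = 0;
        enum sctp_event_type event_type;
        union sctp_subtype subtype;
        enum sctp_state state;
        struct sctp_endpoint *ep;

        /* Every primitive is funneled into the SCTP state machine. */
        event_type = SCTP_EVENT_T_PRIMITIVE;
        subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_SEND);
        state = asoc ? asoc->state : SCTP_STATE_CLOSED;
        ep = asoc ? asoc->ep : NULL;

        error = sctp_do_sm(net, event_type, subtype, state, ep, asoc,
                           arg, GFP_KERNEL);
        return error;
}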
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H

#include "elevator.h"
#include "blk-mq.h"

#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                            unsigned int nr_segs, struct request **merged_request);
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
                            unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
                                   struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);

static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                __blk_mq_sched_restart(hctx);
}

static inline bool bio_mergeable(struct bio *bio)
{
        return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}

static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
                         struct bio *bio)
{
        if (rq->rq_flags & RQF_USE_SCHED) {
                struct elevator_queue *e = q->elevator;

                if (e->type->ops.allow_merge)
                        return e->type->ops.allow_merge(q, rq, bio);
        }
        return true;
}

static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
        if (rq->rq_flags & RQF_USE_SCHED) {
                struct elevator_queue *e = rq->q->elevator;

                if (e->type->ops.completed_request)
                        e->type->ops.completed_request(rq, now);
        }
}

static inline void blk_mq_sched_requeue_request(struct request *rq)
{
        if (rq->rq_flags & RQF_USE_SCHED) {
                struct request_queue *q = rq->q;
                struct elevator_queue *e = q->elevator;

                if (e->type->ops.requeue_request)
                        e->type->ops.requeue_request(rq);
        }
}

static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct elevator_queue *e = hctx->queue->elevator;

        if (e && e->type->ops.has_work)
                return e->type->ops.has_work(hctx);

        return false;
}

static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
        return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

#endif
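The inline helpers in this header all follow one pattern: consult the elevator's ops table only when the request is under scheduler control, call the hook if the scheduler provides it, and otherwise fall back to a permissive default. Below is a small self-contained sketch of that dispatch pattern; the types struct sched_ops and struct req are made-up stand-ins for elevator_mq_ops and struct request, not block-layer definitions.

#include <stdbool.h>
#include <stdio.h>

struct req;

/* Hypothetical ops table with a single optional hook. */
struct sched_ops {
        bool (*allow_merge)(struct req *rq);
};

struct req {
        bool use_sched;                 /* mirrors RQF_USE_SCHED */
        const struct sched_ops *ops;
};

/* Same shape as blk_mq_sched_allow_merge(): default to "merge allowed". */
static bool sched_allow_merge(struct req *rq)
{
        if (rq->use_sched && rq->ops && rq->ops->allow_merge)
                return rq->ops->allow_merge(rq);
        return true;
}

static bool never_merge(struct req *rq)
{
        (void)rq;
        return false;
}

int main(void)
{
        const struct sched_ops ops = { .allow_merge = never_merge };
        struct req plain = { .use_sched = false, .ops = NULL };
        struct req sched = { .use_sched = true, .ops = &ops };

        /* Prints "plain: 1, scheduled: 0". */
        printf("plain: %d, scheduled: %d\n",
               sched_allow_merge(&plain), sched_allow_merge(&sched));
        return 0;
}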
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Local APIC virtualization
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2007 Novell
 * Copyright (C) 2007 Intel
 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
* * Authors: * Dor Laor <dor.laor@qumranet.com> * Gregory Haskins <ghaskins@novell.com> * Yaozu (Eddie) Dong <eddie.dong@intel.com> * * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/smp.h> #include <linux/hrtimer.h> #include <linux/io.h> #include <linux/export.h> #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> #include <asm/apicdef.h> #include <asm/delay.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include "kvm_cache_regs.h" #include "irq.h" #include "ioapic.h" #include "trace.h" #include "x86.h" #include "xen.h" #include "cpuid.h" #include "hyperv.h" #include "smm.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) #else #define mod_64(x, y) ((x) % (y)) #endif /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 /* * Enable local APIC timer advancement (tscdeadline mode only) with adaptive * tuning. When enabled, KVM programs the host timer event to fire early, i.e. * before the deadline expires, to account for the delay between taking the * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume * the guest, i.e. so that the interrupt arrives in the guest with minimal * latency relative to the deadline programmed by the guest. */ static bool lapic_timer_advance __read_mostly = true; module_param(lapic_timer_advance, bool, 0444); #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_NS_INIT 1000 #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { *((u32 *) (regs + reg_off)) = val; } static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) { __kvm_lapic_set_reg(apic->regs, reg_off, val); } static __always_inline u64 __kvm_lapic_get_reg64(char *regs, int reg) { BUILD_BUG_ON(reg != APIC_ICR); return *((u64 *) (regs + reg)); } static __always_inline u64 kvm_lapic_get_reg64(struct kvm_lapic *apic, int reg) { return __kvm_lapic_get_reg64(apic->regs, reg); } static __always_inline void __kvm_lapic_set_reg64(char *regs, int reg, u64 val) { BUILD_BUG_ON(reg != APIC_ICR); *((u64 *) (regs + reg)) = val; } static __always_inline void kvm_lapic_set_reg64(struct kvm_lapic *apic, int reg, u64 val) { __kvm_lapic_set_reg64(apic->regs, reg, val); } static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; return apic_test_vector(vector, apic->regs + APIC_ISR) || apic_test_vector(vector, apic->regs + APIC_IRR); } static inline int __apic_test_and_set_vector(int vec, void *bitmap) { return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } static inline int __apic_test_and_clear_vector(int vec, void 
*bitmap) { return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); static inline int apic_enabled(struct kvm_lapic *apic) { return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic); } #define LVT_MASK \ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) #define LINT_MASK \ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; } static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) { return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) && (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm)); } bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) { return kvm_x86_ops.set_hv_timer && !(kvm_mwait_in_guest(vcpu->kvm) || kvm_can_post_timer_interrupt(vcpu)); } static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) { return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE; } static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); } static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->logical_mode) { case KVM_APIC_MODE_SW_DISABLED: /* Arbitrarily use the flat map so that @cluster isn't NULL. */ *cluster = map->xapic_flat_map; *mask = 0; return true; case KVM_APIC_MODE_X2APIC: { u32 offset = (dest_id >> 16) * 16; u32 max_apic_id = map->max_apic_id; if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { *mask = 0; } return true; } case KVM_APIC_MODE_XAPIC_FLAT: *cluster = map->xapic_flat_map; *mask = dest_id & 0xff; return true; case KVM_APIC_MODE_XAPIC_CLUSTER: *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf]; *mask = dest_id & 0xf; return true; case KVM_APIC_MODE_MAP_DISABLED: return false; default: WARN_ON_ONCE(1); return false; } } static void kvm_apic_map_free(struct rcu_head *rcu) { struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); kvfree(map); } static int kvm_recalculate_phys_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu, bool *xapic_id_mismatch) { struct kvm_lapic *apic = vcpu->arch.apic; u32 x2apic_id = kvm_x2apic_id(apic); u32 xapic_id = kvm_xapic_id(apic); u32 physical_id; /* * For simplicity, KVM always allocates enough space for all possible * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on * without the optimized map. */ if (WARN_ON_ONCE(xapic_id > new->max_apic_id)) return -EINVAL; /* * Bail if a vCPU was added and/or enabled its APIC between allocating * the map and doing the actual calculations for the map. Note, KVM * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if * the compiler decides to reload x2apic_id after this check. */ if (x2apic_id > new->max_apic_id) return -E2BIG; /* * Deliberately truncate the vCPU ID when detecting a mismatched APIC * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a * 32-bit value. Any unwanted aliasing due to truncation results will * be detected below. 
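 *
 * Illustrative example: vCPU ID 0x1b0 truncates to xAPIC ID 0xb0, so
 * comparing the 8-bit xAPIC ID against the full 32-bit vCPU ID would
 * falsely flag a mismatch; any real aliasing (e.g. 0x1b0 and 0x2b0 both
 * landing on 0xb0) is dealt with by the phys_map handling below, as the
 * preceding paragraph notes.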
*/ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id) *xapic_id_mismatch = true; /* * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs. * Allow sending events to vCPUs by their x2APIC ID even if the target * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap * and collide). * * Honor the architectural (and KVM's non-optimized) behavior if * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed * to process messages independently. If multiple vCPUs have the same * effective APIC ID, e.g. due to the x2APIC wrap or because the guest * manually modified its xAPIC IDs, events targeting that ID are * supposed to be recognized by all vCPUs with said ID. */ if (vcpu->kvm->arch.x2apic_format) { /* See also kvm_apic_match_physical_addr(). */ if (apic_x2apic_mode(apic) || x2apic_id > 0xff) new->phys_map[x2apic_id] = apic; if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) new->phys_map[xapic_id] = apic; } else { /* * Disable the optimized map if the physical APIC ID is already * mapped, i.e. is aliased to multiple vCPUs. The optimized * map requires a strict 1:1 mapping between IDs and vCPUs. */ if (apic_x2apic_mode(apic)) physical_id = x2apic_id; else physical_id = xapic_id; if (new->phys_map[physical_id]) return -EINVAL; new->phys_map[physical_id] = apic; } return 0; } static void kvm_recalculate_logical_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; enum kvm_apic_logical_mode logical_mode; struct kvm_lapic **cluster; u16 mask; u32 ldr; if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) return; if (!kvm_apic_sw_enabled(apic)) return; ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (!ldr) return; if (apic_x2apic_mode(apic)) { logical_mode = KVM_APIC_MODE_X2APIC; } else { ldr = GET_APIC_LOGICAL_ID(ldr); if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT) logical_mode = KVM_APIC_MODE_XAPIC_FLAT; else logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER; } /* * To optimize logical mode delivery, all software-enabled APICs must * be configured for the same mode. */ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) { new->logical_mode = logical_mode; } else if (new->logical_mode != logical_mode) { new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; return; } /* * In x2APIC mode, the LDR is read-only and derived directly from the * x2APIC ID, thus is guaranteed to be addressable. KVM reuses * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required. */ if (apic_x2apic_mode(apic)) return; if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; return; } if (!mask) return; ldr = ffs(mask) - 1; if (!is_power_of_2(mask) || cluster[ldr]) new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; else cluster[ldr] = apic; } /* * CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock. * * DIRTY -> UPDATE_IN_PROGRESS and UPDATE_IN_PROGRESS -> CLEAN happen with * apic_map_lock_held. */ enum { CLEAN, UPDATE_IN_PROGRESS, DIRTY }; static void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ bool xapic_id_mismatch; int r; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. 
*/ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; WARN_ONCE(!irqchip_in_kernel(kvm), "Dirty APIC map without an in-kernel local APIC"); mutex_lock(&kvm->arch.apic_map_lock); retry: /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) * or the APIC registers (if dirty). Note, on retry the map may have * not yet been marked dirty by whatever task changed a vCPU's x2APIC * ID, i.e. the map may still show up as in-progress. In that case * this task still needs to retry and complete its calculation. */ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { /* Someone else has updated the map. */ mutex_unlock(&kvm->arch.apic_map_lock); return; } /* * Reset the mismatch flag between attempts so that KVM does the right * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. * keep max_id strictly increasing. Disallowing max_id from shrinking * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU * with the highest x2APIC ID is toggling its APIC on and off. */ xapic_id_mismatch = false; kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL_ACCOUNT); if (!new) goto out; new->max_apic_id = max_id; new->logical_mode = KVM_APIC_MODE_SW_DISABLED; kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); if (r) { kvfree(new); new = NULL; if (r == -E2BIG) { cond_resched(); goto retry; } goto out; } kvm_recalculate_logical_map(new, vcpu); } out: /* * The optimized map is effectively KVM's internal version of APICv, * and all unwanted aliasing that results in disabling the optimized * map also applies to APICv. */ if (!new) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED); if (xapic_id_mismatch) kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); else kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); /* * Write kvm->arch.apic_map before clearing apic->apic_map_dirty. * If another update has come in, leave it DIRTY. 
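	 * (The _release here pairs with the _acquire reads of apic_map_dirty
	 * at the top of this function, so a task that observes CLEAN also
	 * observes the newly published map.)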
*/ atomic_cmpxchg_release(&kvm->arch.apic_map_dirty, UPDATE_IN_PROGRESS, CLEAN); mutex_unlock(&kvm->arch.apic_map_lock); if (old) call_rcu(&old->rcu, kvm_apic_map_free); kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) { bool enabled = val & APIC_SPIV_APIC_ENABLED; kvm_lapic_set_reg(apic, APIC_SPIV, val); if (enabled != apic->sw_enabled) { apic->sw_enabled = enabled; if (enabled) static_branch_slow_dec_deferred(&apic_sw_disabled); else static_branch_inc(&apic_sw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } /* Check if there are APF page ready requests pending */ if (enabled) { kvm_make_request(KVM_REQ_APF_READY, apic->vcpu); kvm_xen_sw_enable_lapic(apic->vcpu); } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { kvm_lapic_set_reg(apic, APIC_LDR, id); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) { kvm_lapic_set_reg(apic, APIC_DFR, val); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = kvm_apic_calc_x2apic_ldr(id); WARN_ON_ONCE(id != apic->vcpu->vcpu_id); kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT; } static inline int apic_lvtt_period(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC; } static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE; } static inline int apic_lvt_nmi_mode(u32 lvt_val) { return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) { return apic->nr_lvt_entries > lvt_index; } static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) { return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); } void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u32 v = 0; if (!lapic_in_kernel(vcpu)) return; v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. Windows with * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC * version first and level-triggered interrupts never get EOIed in * IOAPIC. */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) { int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; int i; if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) return; /* Initialize/mask any "new" LVT entries. 
*/ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic->nr_lvt_entries = nr_lvt_entries; /* The number of LVT entries is reflected in the version register. */ kvm_apic_set_version(vcpu); } static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, [LVT_LINT0] = LINT_MASK, [LVT_LINT1] = LINT_MASK, [LVT_ERROR] = LVT_MASK, [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static int find_highest_vector(void *bitmap) { int vec; u32 *reg; for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG; vec >= 0; vec -= APIC_VECTORS_PER_REG) { reg = bitmap + REG_POS(vec); if (*reg) return __fls(*reg) + vec; } return -1; } static u8 count_vectors(void *bitmap) { int vec; u32 *reg; u8 count = 0; for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) { reg = bitmap + REG_POS(vec); count += hweight32(*reg); } return count; } bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) { u32 i, vec; u32 pir_val, irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); irr_val = *p_irr; pir_val = READ_ONCE(pir[i]); if (pir_val) { pir_val = xchg(&pir[i], 0); prev_irr_val = irr_val; do { irr_val = prev_irr_val | pir_val; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); if (prev_irr_val != irr_val) max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec; } if (irr_val) *max_irr = __fls(irr_val) + vec; } return ((max_updated_irr != -1) && (max_updated_irr == *max_irr)); } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); if (unlikely(!apic->apicv_active && irr_updated)) apic->irr_pending = true; return irr_updated; } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); static inline int apic_search_irr(struct kvm_lapic *apic) { return find_highest_vector(apic->regs + APIC_IRR); } static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; /* * Note that irr_pending is just a hint. It will be always * true with virtual interrupt delivery enabled. */ if (!apic->irr_pending) return -1; result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); return result; } static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { if (unlikely(apic->apicv_active)) { kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); if (apic_search_irr(apic) != -1) apic->irr_pending = true; } } void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec) { apic_clear_irr(vec, vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) return; /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI. */ if (unlikely(apic->apicv_active)) kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); /* * ISR (in service register) bit is set when injecting an interrupt. * The highest vector is injected. 
Thus the latest bit set matches * the highest bit in ISR. */ apic->highest_isr_cache = vec; } } static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; /* * Note that isr_count is always 1, and highest_isr_cache * is always -1, with APIC virtualization enabled. */ if (!apic->isr_count) return -1; if (likely(apic->highest_isr_cache != -1)) return apic->highest_isr_cache; result = find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); return result; } static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) return; /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need * to trigger a new interrupt delivery by writing the SVI field; * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ if (unlikely(apic->apicv_active)) kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); apic->highest_isr_cache = -1; } } void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) return; kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { /* This may race with setting of irr in __apic_accept_irq() and * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ return apic_find_highest_irr(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, struct dest_map *dest_map); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, irq->level, irq->trig_mode, dest_map); } static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map, struct kvm_lapic_irq *irq, u32 min) { int i, count = 0; struct kvm_vcpu *vcpu; if (min > map->max_apic_id) return 0; for_each_set_bit(i, ipi_bitmap, min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { if (map->phys_map[min + i]) { vcpu = map->phys_map[min + i]->vcpu; count += kvm_apic_set_irq(vcpu, irq, NULL); } } return count; } int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { struct kvm_apic_map *map; struct kvm_lapic_irq irq = {0}; int cluster_size = op_64_bit ? 
64 : 32; int count; if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK)) return -KVM_EINVAL; irq.vector = icr & APIC_VECTOR_MASK; irq.delivery_mode = icr & APIC_MODE_MASK; irq.level = (icr & APIC_INT_ASSERT) != 0; irq.trig_mode = icr & APIC_INT_LEVELTRIG; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); count = -EOPNOTSUPP; if (likely(map)) { count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min); min += cluster_size; count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min); } rcu_read_unlock(); return count; } static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) return; __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu) { u8 val; if (pv_eoi_get_user(vcpu, &val) < 0) return false; val &= KVM_PV_EOI_ENABLED; if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) return false; /* * Clear pending bit in any case: it will be set again on vmentry. * While this might not be ideal from performance point of view, * this makes sure pv eoi is only enabled when we know it's safe. */ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); return val; } static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (kvm_x86_ops.sync_pir_to_irr) highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) return -1; return highest_irr; } static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI); tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; if ((tpr & 0xf0) >= (isrv & 0xf0)) ppr = tpr & 0xff; else ppr = isrv & 0xf0; *new_ppr = ppr; if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); return ppr < old_ppr; } static void apic_update_ppr(struct kvm_lapic *apic) { u32 ppr; if (__apic_update_ppr(apic, &ppr) && apic_has_interrupt_for_ppr(apic, ppr) != -1) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) { apic_update_ppr(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr); apic_update_ppr(apic); } static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { return mda == (apic_x2apic_mode(apic) ? X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) { if (kvm_apic_broadcast(apic, mda)) return true; /* * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they * were in x2APIC mode if the target APIC ID can't be encoded as an * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC * mode. 
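 * For example, a hotplugged vCPU with x2APIC ID 0x100 has no valid 8-bit
 * xAPIC encoding, so it is matched by its x2APIC ID here even while the
 * guest still runs it in xAPIC mode.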
Match the x2APIC ID if and only if the target APIC ID can't * be encoded in xAPIC to avoid spurious matches against a vCPU that * changed its (addressable) xAPIC ID (which is writable). */ if (apic_x2apic_mode(apic) || mda > 0xff) return mda == kvm_x2apic_id(apic); return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) { u32 logical_id; if (kvm_apic_broadcast(apic, mda)) return true; logical_id = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) return ((logical_id >> 16) == (mda >> 16)) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: return (logical_id & mda) != 0; case APIC_DFR_CLUSTER: return ((logical_id >> 4) == (mda >> 4)) && (logical_id & mda & 0xf) != 0; default: return false; } } /* The KVM local APIC implementation has two quirks: * * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. * In order to support broadcast without interrupt remapping, x2APIC * rewrites the destination of non-IPI messages from APIC_BROADCAST * to X2APIC_BROADCAST. * * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is * important when userspace wants to use x2APIC-format MSIs, because * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); ASSERT(target); switch (shorthand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) return kvm_apic_match_physical_addr(target, mda); else return kvm_apic_match_logical_addr(target, mda); case APIC_DEST_SELF: return target == source; case APIC_DEST_ALLINC: return true; case APIC_DEST_ALLBUT: return target != source; default: return false; } } EXPORT_SYMBOL_GPL(kvm_apic_match_dest); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size) { u32 mod; int i, idx = -1; mod = vector % dest_vcpus; for (i = 0; i <= mod; i++) { idx = find_next_bit(bitmap, bitmap_size, idx + 1); BUG_ON(idx == bitmap_size); } return idx; } static void kvm_apic_disabled_lapic_found(struct kvm *kvm) { if (!kvm->arch.disabled_lapic_found) { kvm->arch.disabled_lapic_found = true; pr_info("Disabled LAPIC found during irq injection\n"); } } static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map) { if (kvm->arch.x2apic_broadcast_quirk_disabled) { if ((irq->dest_id == APIC_BROADCAST && map->logical_mode != KVM_APIC_MODE_X2APIC)) return true; if (irq->dest_id == X2APIC_BROADCAST) return true; } else { bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); if (irq->dest_id == (x2apic_ipi ? 
X2APIC_BROADCAST : APIC_BROADCAST)) return true; } return false; } /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. * Note: we may have zero kvm_lapic destinations when we return true, which * means that the interrupt should be dropped. In this case, *bitmap would be * zero and *dst undefined. */ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map, struct kvm_lapic ***dst, unsigned long *bitmap) { int i, lowest; if (irq->shorthand == APIC_DEST_SELF && src) { *dst = src; *bitmap = 1; return true; } else if (irq->shorthand) return false; if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; } *bitmap = 0; if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, (u16 *)bitmap)) return false; if (!kvm_lowest_prio_delivery(irq)) return true; if (!kvm_vector_hashing_enabled()) { lowest = -1; for_each_set_bit(i, bitmap, 16) { if (!(*dst)[i]) continue; if (lowest < 0) lowest = i; else if (kvm_apic_compare_prio((*dst)[i]->vcpu, (*dst)[lowest]->vcpu) < 0) lowest = i; } } else { if (!*bitmap) return true; lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), bitmap, 16); if (!(*dst)[lowest]) { kvm_apic_disabled_lapic_found(kvm); *bitmap = 0; return true; } } *bitmap = (lowest >= 0) ? 1 << lowest : 0; return true; } bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; unsigned long bitmap; struct kvm_lapic **dst = NULL; int i; bool ret; *r = -1; if (irq->shorthand == APIC_DEST_SELF) { if (KVM_BUG_ON(!src, kvm)) { *r = 0; return true; } *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); return true; } rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); if (ret) { *r = 0; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } } rcu_read_unlock(); return ret; } /* * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority * interrupt, handle it in posted mode and use the following mechanism * to find the destination vCPU. * 1. For lowest-priority interrupts, store all the possible * destination vCPUs in an array. * 2. Use "guest vector % max number of destination vCPUs" to find * the right destination vCPU in the array for the lowest-priority * interrupt. * - Otherwise, use remapped mode to inject the interrupt. 
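 *
 * Worked example of step 2 (illustrative): an interrupt on vector 0x23
 * with 4 possible destination vCPUs selects array index 0x23 % 4 = 3.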
*/ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; unsigned long bitmap; struct kvm_lapic **dst = NULL; bool ret = false; if (irq->shorthand) return false; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && hweight16(bitmap) == 1) { unsigned long i = find_first_bit(&bitmap, 16); if (dst[i]) { *dest_vcpu = dst[i]->vcpu; ret = true; } } rcu_read_unlock(); return ret; } /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, struct dest_map *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; fallthrough; case APIC_DM_FIXED: if (unlikely(trig_mode && !level)) break; /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; result = 1; if (dest_map) { __set_bit(vcpu->vcpu_id, dest_map->map); dest_map->vectors[vcpu->vcpu_id] = vector; } if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) kvm_lapic_set_vector(vector, apic->regs + APIC_TMR); else kvm_lapic_clear_vector(vector, apic->regs + APIC_TMR); } kvm_x86_call(deliver_interrupt)(apic, delivery_mode, trig_mode, vector); break; case APIC_DM_REMRD: result = 1; vcpu->arch.pv.pv_unhalted = 1; kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_SMI: if (!kvm_inject_smi(vcpu)) { kvm_vcpu_kick(vcpu); result = 1; } break; case APIC_DM_NMI: result = 1; kvm_inject_nmi(vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: if (!trig_mode || level) { result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */ apic->pending_events = (1UL << KVM_APIC_INIT); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; case APIC_DM_STARTUP: result = 1; apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ smp_wmb(); set_bit(KVM_APIC_SIPI, &apic->pending_events); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; case APIC_DM_EXTINT: /* * Should only be called by kvm_apic_local_deliver() with LVT0, * before NMI watchdog was enabled. Already handled by * kvm_apic_accept_pic_intr(). */ break; default: printk(KERN_ERR "TODO: unsupported delivery mode %x\n", delivery_mode); break; } return result; } /* * This routine identifies the destination vcpus mask meant to receive the * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find * out the destination vcpus array and set the bitmap or it traverses to * each available vcpu to identify the same. 
 */
void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
			      unsigned long *vcpu_bitmap)
{
	struct kvm_lapic **dest_vcpu = NULL;
	struct kvm_lapic *src = NULL;
	struct kvm_apic_map *map;
	struct kvm_vcpu *vcpu;
	unsigned long bitmap, i;
	int vcpu_idx;
	bool ret;

	rcu_read_lock();
	map = rcu_dereference(kvm->arch.apic_map);

	ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
					  &bitmap);
	if (ret) {
		for_each_set_bit(i, &bitmap, 16) {
			if (!dest_vcpu[i])
				continue;
			vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
			__set_bit(vcpu_idx, vcpu_bitmap);
		}
	} else {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (!kvm_apic_present(vcpu))
				continue;
			if (!kvm_apic_match_dest(vcpu, NULL,
						 irq->shorthand,
						 irq->dest_id,
						 irq->dest_mode))
				continue;
			__set_bit(i, vcpu_bitmap);
		}
	}
	rcu_read_unlock();
}

int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
{
	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}

static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
{
	return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
}

static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{
	int trigger_mode;

	/* EOI the IOAPIC only if the IOAPIC doesn't own the vector. */
	if (!kvm_ioapic_handles_vector(apic, vector))
		return;

	/* Request a KVM exit to inform the userspace IOAPIC. */
	if (irqchip_split(apic->vcpu->kvm)) {
		apic->vcpu->arch.pending_ioapic_eoi = vector;
		kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
		return;
	}

	if (apic_test_vector(vector, apic->regs + APIC_TMR))
		trigger_mode = IOAPIC_LEVEL_TRIG;
	else
		trigger_mode = IOAPIC_EDGE_TRIG;

	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
}

static int apic_set_eoi(struct kvm_lapic *apic)
{
	int vector = apic_find_highest_isr(apic);

	trace_kvm_eoi(apic, vector);

	/*
	 * Not every EOI write has a corresponding ISR bit set; one example
	 * is when the kernel checks the timer in setup_IO_APIC().
	 */
	if (vector == -1)
		return vector;

	apic_clear_isr(vector, apic);
	apic_update_ppr(apic);

	if (kvm_hv_synic_has_vector(apic->vcpu, vector))
		kvm_hv_synic_send_eoi(apic->vcpu, vector);

	kvm_ioapic_send_eoi(apic, vector);
	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
	return vector;
}

/*
 * This interface assumes a trap-like exit, which has already finished the
 * desired side effects, including the vISR and vPPR updates.
 */
void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
{
	struct kvm_lapic *apic = vcpu->arch.apic;

	trace_kvm_eoi(apic, vector);

	kvm_ioapic_send_eoi(apic, vector);
	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);

void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
{
	struct kvm_lapic_irq irq;

	/* KVM has no delay and should always clear the BUSY/PENDING flag.
*/ WARN_ON_ONCE(icr_low & APIC_ICR_BUSY); irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; irq.level = (icr_low & APIC_INT_ASSERT) != 0; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; irq.msi_redir_hint = false; if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); trace_kvm_apic_ipi(icr_low, irq.dest_id); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } EXPORT_SYMBOL_GPL(kvm_apic_send_ipi); static u32 apic_get_tmcct(struct kvm_lapic *apic) { ktime_t remaining, now; s64 ns; ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 || apic->lapic_timer.period == 0) return 0; now = ktime_get(); remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns * apic->divide_count)); } static void __report_tpr_access(struct kvm_lapic *apic, bool write) { struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } static inline void report_tpr_access(struct kvm_lapic *apic, bool write) { if (apic->vcpu->arch.tpr_access_reporting) __report_tpr_access(apic, write); } static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; if (offset >= LAPIC_MMIO_LENGTH) return 0; switch (offset) { case APIC_ARBPRI: break; case APIC_TMCCT: /* Timer CCR */ if (apic_lvtt_tscdeadline(apic)) return 0; val = apic_get_tmcct(apic); break; case APIC_PROCPRI: apic_update_ppr(apic); val = kvm_lapic_get_reg(apic, offset); break; case APIC_TASKPRI: report_tpr_access(apic, false); fallthrough; default: val = kvm_lapic_get_reg(apic, offset); break; } return val; } static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) { return container_of(dev, struct kvm_lapic, dev); } #define APIC_REG_MASK(reg) (1ull << ((reg) >> 4)) #define APIC_REGS_MASK(first, count) \ (APIC_REG_MASK(first) * ((1ull << (count)) - 1)) u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) { /* Leave bits '0' for reserved and write-only registers. */ u64 valid_reg_mask = APIC_REG_MASK(APIC_ID) | APIC_REG_MASK(APIC_LVR) | APIC_REG_MASK(APIC_TASKPRI) | APIC_REG_MASK(APIC_PROCPRI) | APIC_REG_MASK(APIC_LDR) | APIC_REG_MASK(APIC_SPIV) | APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) | APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) | APIC_REGS_MASK(APIC_IRR, APIC_ISR_NR) | APIC_REG_MASK(APIC_ESR) | APIC_REG_MASK(APIC_ICR) | APIC_REG_MASK(APIC_LVTT) | APIC_REG_MASK(APIC_LVTTHMR) | APIC_REG_MASK(APIC_LVTPC) | APIC_REG_MASK(APIC_LVT0) | APIC_REG_MASK(APIC_LVT1) | APIC_REG_MASK(APIC_LVTERR) | APIC_REG_MASK(APIC_TMICT) | APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. 
*/ if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) | APIC_REG_MASK(APIC_DFR) | APIC_REG_MASK(APIC_ICR2); return valid_reg_mask; } EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask); static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { unsigned char alignment = offset & 0xf; u32 result; /* * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in * x2APIC and needs to be manually handled by the caller. */ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR); if (alignment + len > 4) return 1; if (offset > 0x3f0 || !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset))) return 1; result = __apic_read(apic, offset & ~0xf); trace_kvm_apic_read(offset, result); switch (len) { case 1: case 2: case 4: memcpy(data, (char *)&result + alignment, len); break; default: printk(KERN_ERR "Local APIC read with len = %x, " "should be 1,2, or 4 instead\n", len); break; } return 0; } static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { return addr >= apic->base_address && addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, void *data) { struct kvm_lapic *apic = to_lapic(this); u32 offset = address - apic->base_address; if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) return -EOPNOTSUPP; memset(data, 0xff, len); return 0; } kvm_lapic_reg_read(apic, offset, len, data); return 0; } static void update_divide_count(struct kvm_lapic *apic) { u32 tmp1, tmp2, tdcr; tdcr = kvm_lapic_get_reg(apic, APIC_TDCR); tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); } static void limit_periodic_timer_frequency(struct kvm_lapic *apic) { /* * Do not allow the guest to program periodic timers with small * interval, since the hrtimers are not throttled by the host * scheduler. */ if (apic_lvtt_period(apic) && apic->lapic_timer.period) { s64 min_period = min_timer_period_us * 1000LL; if (apic->lapic_timer.period < min_period) { pr_info_once( "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, apic->lapic_timer.period, min_period); apic->lapic_timer.period = min_period; } } } static void cancel_hv_timer(struct kvm_lapic *apic); static void cancel_apic_timer(struct kvm_lapic *apic) { hrtimer_cancel(&apic->lapic_timer.timer); preempt_disable(); if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); preempt_enable(); atomic_set(&apic->lapic_timer.pending, 0); } static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & apic->lapic_timer.timer_mode_mask; if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; } apic->lapic_timer.timer_mode = timer_mode; limit_periodic_timer_frequency(apic); } } /* * On APICv, this test will cause a busy wait * during a higher-priority task. 
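 * (With APICv the pending timer vector sits in the IRR rather than the ISR
 * until the guest can actually take it, hence the IRR check below.)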
*/ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) return true; } return false; } static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) { u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; /* * If the guest TSC is running at a different ratio than the host, then * convert the delay to nanoseconds to achieve an accurate delay. Note * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. */ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { u64 delay_ns = guest_cycles * 1000000ULL; do_div(delay_ns, vcpu->arch.virtual_tsc_khz); ndelay(min_t(u32, delay_ns, timer_advance_ns)); } } static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, s64 advance_expire_delta) { struct kvm_lapic *apic = vcpu->arch.apic; u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 ns; /* Do not adjust for tiny fluctuations or large random spikes. */ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX || abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN) return; /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); /* * If the timer fired early, reread the TSC to account for the overhead * of the above adjustment to avoid waiting longer than is necessary. 
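	 * (Illustrative math for the adjustment above: with a virtual_tsc_khz
	 * of 2000000, an early delta of 4000 cycles converts to
	 * 4000 * 1000000 / 2000000 = 2000 ns, of which only
	 * 1/LAPIC_TIMER_ADVANCE_ADJUST_STEP is folded into timer_advance_ns.)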
*/ if (guest_tsc < tsc_deadline) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); } void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.expired_tscdeadline && vcpu->arch.apic->lapic_timer.timer_advance_ns && lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) { ktimer->tscdeadline = 0; } else if (apic_lvtt_oneshot(apic)) { ktimer->tscdeadline = 0; ktimer->target_expiration = 0; } } static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) { struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) return; if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { /* * Ensure the guest's timer has truly expired before posting an * interrupt. Open code the relevant checks to avoid querying * lapic_timer_int_injected(), which will be false since the * interrupt isn't yet injected. Waiting until after injecting * is not an option since that won't help a posted interrupt. */ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && vcpu->arch.apic->lapic_timer.timer_advance_ns) __kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } atomic_inc(&apic->lapic_timer.pending); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn) kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; ktime_t now; if (unlikely(!tscdeadline || !this_tsc_khz)) return; local_irq_save(flags); now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); if (likely(tscdeadline > guest_tsc) && likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD); } else apic_timer_expired(apic, false); local_irq_restore(flags); } static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) { return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns * (u64)apic->divide_count; } static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) { ktime_t now, remaining; u64 ns_remaining_old, ns_remaining_new; apic->lapic_timer.period = tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); limit_periodic_timer_frequency(apic); now = ktime_get(); remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = 0; ns_remaining_old = ktime_to_ns(remaining); ns_remaining_new = mul_u64_u32_div(ns_remaining_old, apic->divide_count, old_divisor); apic->lapic_timer.tscdeadline += nsec_to_cycles(apic->vcpu, ns_remaining_new) - nsec_to_cycles(apic->vcpu, ns_remaining_old); 
apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new); } static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) { ktime_t now; u64 tscl = rdtsc(); s64 deadline; now = ktime_get(); apic->lapic_timer.period = tmict_to_ns(apic, kvm_lapic_get_reg(apic, APIC_TMICT)); if (!apic->lapic_timer.period) { apic->lapic_timer.tscdeadline = 0; return false; } limit_periodic_timer_frequency(apic); deadline = apic->lapic_timer.period; if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { if (unlikely(count_reg != APIC_TMICT)) { deadline = tmict_to_ns(apic, kvm_lapic_get_reg(apic, count_reg)); if (unlikely(deadline <= 0)) { if (apic_lvtt_period(apic)) deadline = apic->lapic_timer.period; else deadline = 0; } else if (unlikely(deadline > apic->lapic_timer.period)) { pr_info_ratelimited( "vcpu %i: requested lapic timer restore with " "starting count register %#x=%u (%lld ns) > initial count (%lld ns). " "Using initial count to start timer.\n", apic->vcpu->vcpu_id, count_reg, kvm_lapic_get_reg(apic, count_reg), deadline, apic->lapic_timer.period); kvm_lapic_set_reg(apic, count_reg, 0); deadline = apic->lapic_timer.period; } } } apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, deadline); apic->lapic_timer.target_expiration = ktime_add_ns(now, deadline); return true; } static void advance_periodic_target_expiration(struct kvm_lapic *apic) { ktime_t now = ktime_get(); u64 tscl = rdtsc(); ktime_t delta; /* * Synchronize both deadlines to the same time source or * differences in the periods (caused by differences in the * underlying clocks or numerical approximation errors) will * cause the two to drift apart over time as the errors * accumulate. */ apic->lapic_timer.target_expiration = ktime_add_ns(apic->lapic_timer.target_expiration, apic->lapic_timer.period); delta = ktime_sub(apic->lapic_timer.target_expiration, now); apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, delta); } static void start_sw_period(struct kvm_lapic *apic) { if (!apic->lapic_timer.period) return; if (ktime_after(ktime_get(), apic->lapic_timer.target_expiration)) { apic_timer_expired(apic, false); if (apic_lvtt_oneshot(apic)) return; advance_periodic_target_expiration(apic); } hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, HRTIMER_MODE_ABS_HARD); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) return false; return vcpu->arch.apic->lapic_timer.hv_timer_in_use; } static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); kvm_x86_call(cancel_hv_timer)(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } static bool start_hv_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; struct kvm_vcpu *vcpu = apic->vcpu; bool expired; WARN_ON(preemptible()); if (!kvm_can_use_hv_timer(vcpu)) return false; if (!ktimer->tscdeadline) return false; if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; hrtimer_cancel(&ktimer->timer); /* * To simplify handling the periodic timer, leave the hv timer running * even if the deadline timer has expired, i.e. rely on the resulting * VM-Exit to recompute the periodic timer's target expiration. */ if (!apic_lvtt_period(apic)) { /* * Cancel the hv timer if the sw timer fired while the hv timer * was being programmed, or if the hv timer itself expired. 
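		 * (In the expired case the timer interrupt is still delivered
		 * via apic_timer_expired() before the hv timer is torn down.)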
*/ if (atomic_read(&ktimer->pending)) { cancel_hv_timer(apic); } else if (expired) { apic_timer_expired(apic, false); cancel_hv_timer(apic); } } trace_kvm_hv_timer_state(vcpu->vcpu_id, ktimer->hv_timer_in_use); return true; } static void start_sw_timer(struct kvm_lapic *apic) { struct kvm_timer *ktimer = &apic->lapic_timer; WARN_ON(preemptible()); if (apic->lapic_timer.hv_timer_in_use) cancel_hv_timer(apic); if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) return; if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) start_sw_period(apic); else if (apic_lvtt_tscdeadline(apic)) start_sw_tscdeadline(apic); trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); } static void restart_apic_timer(struct kvm_lapic *apic) { preempt_disable(); if (!apic_lvtt_period(apic) && atomic_read(&apic->lapic_timer.pending)) goto out; if (!start_hv_timer(apic)) start_sw_timer(apic); out: preempt_enable(); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; preempt_disable(); /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; WARN_ON(kvm_vcpu_is_blocking(vcpu)); apic_timer_expired(apic, false); cancel_hv_timer(apic); if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); restart_apic_timer(apic); } out: preempt_enable(); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { restart_apic_timer(vcpu->arch.apic); } void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; preempt_disable(); /* Possibly the TSC deadline timer is not enabled yet */ if (apic->lapic_timer.hv_timer_in_use) start_sw_timer(apic); preempt_enable(); } void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(!apic->lapic_timer.hv_timer_in_use); restart_apic_timer(apic); } static void __start_apic_timer(struct kvm_lapic *apic, u32 count_reg) { atomic_set(&apic->lapic_timer.pending, 0); if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) && !set_target_expiration(apic, count_reg)) return; restart_apic_timer(apic); } static void start_apic_timer(struct kvm_lapic *apic) { __start_apic_timer(apic, APIC_TMICT); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) { bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val); if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) { apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode; if (lvt0_in_nmi_mode) { atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } else atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } } static int get_lvt_index(u32 reg) { if (reg == APIC_LVTCMCI) return LVT_CMCI; if (reg < APIC_LVTT || reg > APIC_LVTERR) return -1; return array_index_nospec( (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); } static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; trace_kvm_apic_write(reg, val); switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) { kvm_apic_set_xapic_id(apic, val >> 24); } else { ret = 1; } break; case APIC_TASKPRI: report_tpr_access(apic, true); apic_set_tpr(apic, val & 0xff); break; case APIC_EOI: apic_set_eoi(apic); break; case APIC_LDR: if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); else ret = 1; break; case APIC_DFR: if (!apic_x2apic_mode(apic)) kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); else ret = 1; break; case APIC_SPIV: { u32 mask 
= 0x3ff; if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) mask |= APIC_SPIV_DIRECTED_EOI; apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; for (i = 0; i < apic->nr_lvt_entries; i++) { kvm_lapic_set_reg(apic, APIC_LVTx(i), kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); } break; } case APIC_ICR: WARN_ON_ONCE(apic_x2apic_mode(apic)); /* No delay here, so we always clear the pending bit */ val &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; case APIC_ICR2: if (apic_x2apic_mode(apic)) ret = 1; else kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); fallthrough; case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: case APIC_LVTERR: case APIC_LVTCMCI: { u32 index = get_lvt_index(reg); if (!kvm_lapic_lvt_supported(apic, index)) { ret = 1; break; } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; } case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask); kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); break; case APIC_TMICT: if (apic_lvtt_tscdeadline(apic)) break; cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb); update_divide_count(apic); if (apic->divide_count != old_divisor && apic->lapic_timer.period) { hrtimer_cancel(&apic->lapic_timer.timer); update_target_expiration(apic, old_divisor); restart_apic_timer(apic); } break; } case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) ret = 1; break; case APIC_SELF_IPI: /* * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold * the vector, everything else is reserved. */ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK)) ret = 1; else kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0); break; default: ret = 1; break; } /* * Recalculate APIC maps if necessary, e.g. if the software enable bit * was toggled, the APIC ID changed, etc... The maps are marked dirty * on relevant changes, i.e. this is a nop for most writes. */ kvm_recalculate_apic_map(apic->vcpu->kvm); return ret; } static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t address, int len, const void *data) { struct kvm_lapic *apic = to_lapic(this); unsigned int offset = address - apic->base_address; u32 val; if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) return -EOPNOTSUPP; return 0; } /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. 
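	 * For example, a 2-byte store to offset 0x82 is silently dropped by
	 * the length/alignment check below.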
* Refer SDM 8.4.1 */ if (len != 4 || (offset & 0xf)) return 0; val = *(u32*)data; kvm_lapic_reg_write(apic, offset & 0xff0, val); return 0; } void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); #define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) { if (data & X2APIC_ICR_RESERVED_BITS) return 1; /* * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but * only AMD requires it to be zero, Intel essentially just ignores the * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, * the CPU performs the reserved bits checks, i.e. the underlying CPU * behavior will "win". Arbitrarily clear the BUSY bit, as there is no * sane way to provide consistent behavior with respect to hardware. */ data &= ~APIC_ICR_BUSY; kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); if (kvm_x86_ops.x2apic_icr_is_split) { kvm_lapic_set_reg(apic, APIC_ICR, data); kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); } else { kvm_lapic_set_reg64(apic, APIC_ICR, data); } trace_kvm_apic_write(APIC_ICR, data); return 0; } static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) { if (kvm_x86_ops.x2apic_icr_is_split) return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; return kvm_lapic_get_reg64(apic, APIC_ICR); } /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { struct kvm_lapic *apic = vcpu->arch.apic; /* * ICR is a single 64-bit register when x2APIC is enabled, all others * registers hold 32-bit values. For legacy xAPIC, ICR writes need to * go down the common path to get the upper half from ICR2. * * Note, using the write helpers may incur an unnecessary write to the * virtual APIC state, but KVM needs to conditionally modify the value * in certain cases, e.g. to clear the ICR busy bit. The cost of extra * conditional branches is likely a wash relative to the cost of the * maybe-unecessary write, and both are in the noise anyways. 
*/ if (apic_x2apic_mode(apic) && offset == APIC_ICR) WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (!vcpu->arch.apic) { static_branch_dec(&kvm_has_noapic_vcpu); return; } hrtimer_cancel(&apic->lapic_timer.timer); if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)) static_branch_slow_dec_deferred(&apic_hw_disabled); if (!apic->sw_enabled) static_branch_slow_dec_deferred(&apic_sw_disabled); if (apic->regs) free_page((unsigned long)apic->regs); kfree(apic); } /* *---------------------------------------------------------------------- * LAPIC interface *---------------------------------------------------------------------- */ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; } void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(&apic->lapic_timer.timer); apic->lapic_timer.tscdeadline = data; start_apic_timer(apic); } void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4); } u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; } static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) { u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) kvm_update_cpuid_runtime(vcpu); if (!apic) return; /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_branch_slow_dec_deferred(&apic_hw_disabled); /* Check if there are APF page ready requests pending */ kvm_make_request(KVM_REQ_APF_READY, vcpu); } else { static_branch_inc(&apic_hw_disabled.key); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); } } if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); else if (value & MSR_IA32_APICBASE_ENABLE) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); kvm_x86_call(set_virtual_apic_mode)(vcpu); } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; if ((value & MSR_IA32_APICBASE_ENABLE) && apic->base_address != APIC_DEFAULT_PHYS_BASE) { kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); } } int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(value); if (vcpu->arch.apic_base == value) return 0; u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 
0 : X2APIC_ENABLE); if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) return 1; if (!host_initiated) { if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) return 1; if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) return 1; } __kvm_apic_set_base(vcpu, value); kvm_recalculate_apic_map(vcpu->kvm); return 0; } void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; /* * When APICv is enabled, KVM must always search the IRR for a pending * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU * isn't running. If APICv is disabled, KVM _should_ search the IRR * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching * the IRR at this time could race with IRQ delivery from hardware that * still sees APICv as being enabled. * * FIXME: Ensure other vCPUs and devices observe the change in APICv * state prior to updating KVM's metadata caches, so that KVM * can safely search the IRR and set irr_pending accordingly. */ apic->irr_pending = true; if (apic->apicv_active) apic->isr_count = 1; else apic->isr_count = count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; } int kvm_alloc_apic_access_page(struct kvm *kvm) { void __user *hva; int ret = 0; mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_memslot_enabled || kvm->arch.apic_access_memslot_inhibited) goto out; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (IS_ERR(hva)) { ret = PTR_ERR(hva); goto out; } kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return ret; } EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page); void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; if (!kvm->arch.apic_access_memslot_enabled) return; kvm_vcpu_srcu_read_unlock(vcpu); mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_memslot_enabled) { __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); /* * Clear "enabled" after the memslot is deleted so that a * different vCPU doesn't get a false negative when checking * the flag out of slots_lock. No additional memory barrier is * needed as modifying memslots requires waiting other vCPUs to * drop SRCU (see above), and false positives are ok as the * flag is rechecked after acquiring slots_lock. */ kvm->arch.apic_access_memslot_enabled = false; /* * Mark the memslot as inhibited to prevent reallocating the * memslot during vCPU creation, e.g. if a vCPU is hotplugged. */ kvm->arch.apic_access_memslot_inhibited = true; } mutex_unlock(&kvm->slots_lock); kvm_vcpu_srcu_read_lock(vcpu); } void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; u64 msr_val; int i; kvm_x86_call(apicv_pre_state_restore)(vcpu); if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) msr_val |= MSR_IA32_APICBASE_BSP; /* * Use the inner helper to avoid an extra recalcuation of the * optimized APIC map if some other task has dirtied the map. * The recalculation needed for this vCPU will be done after * all APIC state has been initialized (see below). */ __kvm_apic_set_base(vcpu, msr_val); } if (!apic) return; /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); /* The xAPIC ID is set at RESET even if the APIC was already enabled. 
*/ if (!init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < apic->nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) kvm_lapic_set_reg(apic, APIC_LVT0, SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); kvm_apic_set_dfr(apic, 0xffffffffU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) kvm_apic_set_ldr(apic, 0); kvm_lapic_set_reg(apic, APIC_ESR, 0); if (!apic_x2apic_mode(apic)) { kvm_lapic_set_reg(apic, APIC_ICR, 0); kvm_lapic_set_reg(apic, APIC_ICR2, 0); } else { kvm_lapic_set_reg64(apic, APIC_ICR, 0); } kvm_lapic_set_reg(apic, APIC_TDCR, 0); kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) { kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0); kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0); kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } kvm_apic_update_apicv(vcpu); update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (apic->apicv_active) { kvm_x86_call(apicv_post_state_restore)(vcpu); kvm_x86_call(hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; kvm_recalculate_apic_map(vcpu->kvm); } /* *---------------------------------------------------------------------- * timer interface *---------------------------------------------------------------------- */ static bool lapic_is_periodic(struct kvm_lapic *apic) { return apic_lvtt_period(apic); } int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); return 0; } int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); if (r && lvt_type == APIC_LVTPC && guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } return 0; } void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (apic) kvm_apic_local_deliver(apic, APIC_LVT0); } static const struct kvm_io_device_ops apic_mmio_ops = { .read = apic_mmio_read, .write = apic_mmio_write, }; static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) { struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer); apic_timer_expired(apic, true); if (lapic_is_periodic(apic)) { advance_periodic_target_expiration(apic); hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; } int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; ASSERT(vcpu != NULL); if (!irqchip_in_kernel(vcpu->kvm)) { static_branch_inc(&kvm_has_noapic_vcpu); return 0; } apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; vcpu->arch.apic = apic; if (kvm_x86_ops.alloc_apic_backing_page) apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu); else apic->regs = 
(void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->vcpu = vcpu; apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; /* * Stuff the APIC ENABLE bit in lieu of temporarily incrementing * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset(). */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); /* * Defer evaluating inhibits until the vCPU is first run, as this vCPU * will not get notified of any changes until this vCPU is visible to * other vCPUs (marked online and added to the set of vCPUs). * * Opportunistically mark APICv active as VMX in particularly is highly * unlikely to have inhibits. Ignore the current per-VM APICv state so * that vCPU creation is guaranteed to run with a deterministic value, * the request will ensure the vCPU gets the correct state before VM-Entry. */ if (enable_apicv) { apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } return 0; nomem_free_apic: kfree(apic); vcpu->arch.apic = NULL; nomem: return -ENOMEM; } int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; if (!kvm_apic_present(vcpu)) return -1; __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } EXPORT_SYMBOL_GPL(kvm_apic_has_interrupt); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) return 1; return 0; } void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_inject_pending_timer_irqs(apic); atomic_set(&apic->lapic_timer.pending, 0); } } void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) { struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; if (WARN_ON_ONCE(vector < 0 || !apic)) return; /* * We get here even with APIC virtualization enabled, if doing * nested virtualization and L1 runs with the "acknowledge interrupt * on exit" mode. Then we cannot inject the interrupt via RVI, * because the process would deliver it through the IDT. */ apic_clear_irr(vector, apic); if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another * KVM_REQ_EVENT. */ apic_update_ppr(apic); } else { /* * For normal interrupts, PPR has been raised and there cannot * be a higher-priority pending interrupt---except if there was * a concurrent interrupt injection, but that would have * triggered KVM_REQ_EVENT already. 
*/ apic_set_isr(vector, apic); __apic_update_ppr(apic, &ppr); } } EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { if (apic_x2apic_mode(vcpu->arch.apic)) { u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic); u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); u64 icr; if (vcpu->kvm->arch.x2apic_format) { if (*id != x2apic_id) return -EINVAL; } else { /* * Ignore the userspace value when setting APIC state. * KVM's model is that the x2APIC ID is readonly, e.g. * KVM only supports delivering interrupts to KVM's * version of the x2APIC ID. However, for backwards * compatibility, don't reject attempts to set a * mismatched ID for userspace that hasn't opted into * x2apic_format. */ if (set) *id = x2apic_id; else *id = x2apic_id << 24; } /* * In x2APIC mode, the LDR is fixed and based on the id. And * if the ICR is _not_ split, ICR is internally a single 64-bit * register, but needs to be split to ICR+ICR2 in userspace for * backwards compatibility. */ if (set) *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); if (!kvm_x86_ops.x2apic_icr_is_split) { if (set) { icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); } else { icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); } } } return 0; } int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); /* * Get calculated timer current count for remaining timer period (if * any) and store it in the returned register set. */ __kvm_lapic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT)); return kvm_apic_state_fixup(vcpu, s, false); } int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; int r; kvm_x86_call(apicv_pre_state_restore)(vcpu); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); r = kvm_apic_state_fixup(vcpu, s, true); if (r) { kvm_recalculate_apic_map(vcpu->kvm); return r; } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); cancel_apic_timer(apic); apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); __start_apic_timer(apic, APIC_TMCCT); kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); if (apic->apicv_active) { kvm_x86_call(apicv_post_state_restore)(vcpu); kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); vcpu->arch.apic_arb_prio = 0; return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; if (!lapic_in_kernel(vcpu) || kvm_can_post_timer_interrupt(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD); } /* * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt * * Detect whether guest triggered PV EOI since the * last entry. If yes, set EOI on guests's behalf. * Clear PV EOI in guest memory in any case. 
*/ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic) { int vector; /* * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host * and KVM_PV_EOI_ENABLED in guest memory as follows: * * KVM_APIC_PV_EOI_PENDING is unset: * -> host disabled PV EOI. * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: * -> host enabled PV EOI, guest did not execute EOI yet. * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: * -> host enabled PV EOI, guest executed EOI. */ BUG_ON(!pv_eoi_enabled(vcpu)); if (pv_eoi_test_and_clr_pending(vcpu)) return; vector = apic_set_eoi(apic); trace_kvm_pv_eoi(apic, vector); } void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); } /* * apic_sync_pv_eoi_to_guest - called before vmentry * * Detect whether it's safe to enable PV EOI and * if yes do so. */ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic) { if (!pv_eoi_enabled(vcpu) || /* IRR set or many bits in ISR: could be nested. */ apic->irr_pending || /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. */ return; } pv_eoi_set_pending(apic->vcpu); } void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; struct kvm_lapic *apic = vcpu->arch.apic; apic_sync_pv_eoi_to_guest(vcpu, apic); if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) max_irr = 0; max_isr = apic_find_highest_isr(apic); if (max_isr < 0) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } else { __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } vcpu->arch.apic->vapic_addr = vapic_addr; return 0; } static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) { u32 low; if (reg == APIC_ICR) { *data = kvm_x2apic_icr_read(apic); return 0; } if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; *data = low; return 0; } static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data) { /* * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and * can be written as such, all other registers remain accessible only * through 32-bit reads/writes. */ if (reg == APIC_ICR) return kvm_x2apic_icr_write(apic, data); /* Bits 63:32 are reserved in all other registers. 
*/ if (data >> 32) return 1; return kvm_lapic_reg_write(apic, reg, (u32)data); } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; return kvm_lapic_msr_write(apic, reg, data); } int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; return kvm_lapic_msr_read(apic, reg, data); } int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { if (!lapic_in_kernel(vcpu)) return 1; return kvm_lapic_msr_write(vcpu->arch.apic, reg, data); } int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) { if (!lapic_in_kernel(vcpu)) return 1; return kvm_lapic_msr_read(vcpu->arch.apic, reg, data); } int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; unsigned long new_len; int ret; if (!IS_ALIGNED(addr, 4)) return 1; if (data & KVM_MSR_ENABLED) { if (addr == ghc->gpa && len <= ghc->len) new_len = ghc->len; else new_len = len; ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); if (ret) return ret; } vcpu->arch.pv_eoi.msr_val = data; return 0; } int kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; int r; if (!kvm_apic_has_pending_init_or_sipi(vcpu)) return 0; if (is_guest_mode(vcpu)) { r = kvm_check_nested_events(vcpu); if (r < 0) return r == -EBUSY ? 0 : r; /* * Continue processing INIT/SIPI even if a nested VM-Exit * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI * are blocked as a result of transitioning to VMX root mode. */ } /* * INITs are blocked while CPU is in specific states (SMM, VMX root * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in * wait-for-SIPI (WFS). */ if (!kvm_apic_init_sipi_allowed(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); clear_bit(KVM_APIC_SIPI, &apic->pending_events); return 0; } if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } return 0; } void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); WARN_ON(static_branch_unlikely(&apic_hw_disabled.key)); static_key_deferred_flush(&apic_sw_disabled); WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); }
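/*
 * Illustrative sketch (editorial addition, not part of lapic.c): the x2APIC
 * MSR helpers above map an MSR index to a legacy APIC register offset with
 * reg = (msr - APIC_BASE_MSR) << 4. The standalone snippet below, which
 * assumes the architectural x2APIC base MSR index of 0x800, shows the mapping
 * for the ICR: MSR 0x830 lands on offset 0x300 (APIC_ICR), the one register
 * kvm_lapic_msr_read()/kvm_lapic_msr_write() treat as a full 64-bit value.
 */
#if 0	/* example only, never compiled into the kernel */
#include <stdio.h>

int main(void)
{
	const unsigned int apic_base_msr = 0x800;	/* x2APIC MSR range base */
	const unsigned int icr_msr = 0x830;		/* x2APIC ICR MSR index */
	unsigned int reg = (icr_msr - apic_base_msr) << 4;

	/* Prints 0x300, i.e. the APIC_ICR register offset. */
	printf("x2APIC MSR 0x%x -> APIC register offset 0x%x\n", icr_msr, reg);
	return 0;
}
#endif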
// SPDX-License-Identifier: GPL-2.0-only
/*
 * V4L2 sub-device
 *
 * Copyright (C) 2010 Nokia Corporation
 *
 * Contact: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
 *	    Sakari Ailus <sakari.ailus@iki.fi>
 */
#include <linux/export.h>
#include <linux/ioctl.h>
#include <linux/leds.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/version.h>
#include <linux/videodev2.h>

#include <media/v4l2-ctrls.h>
#include <media/v4l2-device.h>
#include <media/v4l2-event.h>
#include <media/v4l2-fh.h>
#include <media/v4l2-ioctl.h>

#if defined(CONFIG_VIDEO_V4L2_SUBDEV_API)
/*
 * The Streams API is an experimental feature. To use the Streams API, set
 * 'v4l2_subdev_enable_streams_api' to 1 below.
 */
static bool v4l2_subdev_enable_streams_api;
#endif

/*
 * Maximum stream ID is 63 for now, as we use u64 bitmask to represent a set
 * of streams.
 *
 * Note that V4L2_FRAME_DESC_ENTRY_MAX is related: V4L2_FRAME_DESC_ENTRY_MAX
 * restricts the total number of streams in a pad, although the stream ID is
 * not restricted.
*/ #define V4L2_SUBDEV_MAX_STREAM_ID 63 #include "v4l2-subdev-priv.h" #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static int subdev_fh_init(struct v4l2_subdev_fh *fh, struct v4l2_subdev *sd) { struct v4l2_subdev_state *state; static struct lock_class_key key; state = __v4l2_subdev_state_alloc(sd, "fh->state->lock", &key); if (IS_ERR(state)) return PTR_ERR(state); fh->state = state; return 0; } static void subdev_fh_free(struct v4l2_subdev_fh *fh) { __v4l2_subdev_state_free(fh->state); fh->state = NULL; } static int subdev_open(struct file *file) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_subdev_fh *subdev_fh; int ret; subdev_fh = kzalloc(sizeof(*subdev_fh), GFP_KERNEL); if (subdev_fh == NULL) return -ENOMEM; ret = subdev_fh_init(subdev_fh, sd); if (ret) { kfree(subdev_fh); return ret; } v4l2_fh_init(&subdev_fh->vfh, vdev); v4l2_fh_add(&subdev_fh->vfh); file->private_data = &subdev_fh->vfh; if (sd->v4l2_dev->mdev && sd->entity.graph_obj.mdev->dev) { struct module *owner; owner = sd->entity.graph_obj.mdev->dev->driver->owner; if (!try_module_get(owner)) { ret = -EBUSY; goto err; } subdev_fh->owner = owner; } if (sd->internal_ops && sd->internal_ops->open) { ret = sd->internal_ops->open(sd, subdev_fh); if (ret < 0) goto err; } return 0; err: module_put(subdev_fh->owner); v4l2_fh_del(&subdev_fh->vfh); v4l2_fh_exit(&subdev_fh->vfh); subdev_fh_free(subdev_fh); kfree(subdev_fh); return ret; } static int subdev_close(struct file *file) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *vfh = file->private_data; struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); if (sd->internal_ops && sd->internal_ops->close) sd->internal_ops->close(sd, subdev_fh); module_put(subdev_fh->owner); v4l2_fh_del(vfh); v4l2_fh_exit(vfh); subdev_fh_free(subdev_fh); kfree(subdev_fh); file->private_data = NULL; return 0; } #else /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static int subdev_open(struct file *file) { return -ENODEV; } static int subdev_close(struct file *file) { return -ENODEV; } #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static void v4l2_subdev_enable_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) if (!IS_ERR_OR_NULL(sd->privacy_led)) led_set_brightness(sd->privacy_led, sd->privacy_led->max_brightness); #endif } static void v4l2_subdev_disable_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) if (!IS_ERR_OR_NULL(sd->privacy_led)) led_set_brightness(sd->privacy_led, 0); #endif } static inline int check_which(u32 which) { if (which != V4L2_SUBDEV_FORMAT_TRY && which != V4L2_SUBDEV_FORMAT_ACTIVE) return -EINVAL; return 0; } static inline int check_pad(struct v4l2_subdev *sd, u32 pad) { #if defined(CONFIG_MEDIA_CONTROLLER) if (sd->entity.num_pads) { if (pad >= sd->entity.num_pads) return -EINVAL; return 0; } #endif /* allow pad 0 on subdevices not registered as media entities */ if (pad > 0) return -EINVAL; return 0; } static int check_state(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 which, u32 pad, u32 stream) { if (sd->flags & V4L2_SUBDEV_FL_STREAMS) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) if (!v4l2_subdev_state_get_format(state, pad, stream)) return -EINVAL; return 0; #else return -EINVAL; #endif } if (stream != 0) return -EINVAL; if (which == V4L2_SUBDEV_FORMAT_TRY && (!state || !state->pads)) return -EINVAL; return 0; } static inline int check_format(struct v4l2_subdev *sd, struct v4l2_subdev_state 
*state, struct v4l2_subdev_format *format) { if (!format) return -EINVAL; return check_which(format->which) ? : check_pad(sd, format->pad) ? : check_state(sd, state, format->which, format->pad, format->stream); } static int call_get_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { return check_format(sd, state, format) ? : sd->ops->pad->get_fmt(sd, state, format); } static int call_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { return check_format(sd, state, format) ? : sd->ops->pad->set_fmt(sd, state, format); } static int call_enum_mbus_code(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_mbus_code_enum *code) { if (!code) return -EINVAL; return check_which(code->which) ? : check_pad(sd, code->pad) ? : check_state(sd, state, code->which, code->pad, code->stream) ? : sd->ops->pad->enum_mbus_code(sd, state, code); } static int call_enum_frame_size(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_size_enum *fse) { if (!fse) return -EINVAL; return check_which(fse->which) ? : check_pad(sd, fse->pad) ? : check_state(sd, state, fse->which, fse->pad, fse->stream) ? : sd->ops->pad->enum_frame_size(sd, state, fse); } static int call_enum_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval_enum *fie) { if (!fie) return -EINVAL; return check_which(fie->which) ? : check_pad(sd, fie->pad) ? : check_state(sd, state, fie->which, fie->pad, fie->stream) ? : sd->ops->pad->enum_frame_interval(sd, state, fie); } static inline int check_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel) { if (!sel) return -EINVAL; return check_which(sel->which) ? : check_pad(sd, sel->pad) ? : check_state(sd, state, sel->which, sel->pad, sel->stream); } static int call_get_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel) { return check_selection(sd, state, sel) ? : sd->ops->pad->get_selection(sd, state, sel); } static int call_set_selection(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_selection *sel) { return check_selection(sd, state, sel) ? : sd->ops->pad->set_selection(sd, state, sel); } static inline int check_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { if (!fi) return -EINVAL; return check_which(fi->which) ? : check_pad(sd, fi->pad) ? : check_state(sd, state, fi->which, fi->pad, fi->stream); } static int call_get_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { return check_frame_interval(sd, state, fi) ? : sd->ops->pad->get_frame_interval(sd, state, fi); } static int call_set_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { return check_frame_interval(sd, state, fi) ? : sd->ops->pad->set_frame_interval(sd, state, fi); } static int call_get_frame_desc(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_frame_desc *fd) { unsigned int i; int ret; #if defined(CONFIG_MEDIA_CONTROLLER) if (!(sd->entity.pads[pad].flags & MEDIA_PAD_FL_SOURCE)) return -EOPNOTSUPP; #endif memset(fd, 0, sizeof(*fd)); ret = sd->ops->pad->get_frame_desc(sd, pad, fd); if (ret) return ret; dev_dbg(sd->dev, "Frame descriptor on pad %u, type %s\n", pad, fd->type == V4L2_MBUS_FRAME_DESC_TYPE_PARALLEL ? 
"parallel" : fd->type == V4L2_MBUS_FRAME_DESC_TYPE_CSI2 ? "CSI-2" : "unknown"); for (i = 0; i < fd->num_entries; i++) { struct v4l2_mbus_frame_desc_entry *entry = &fd->entry[i]; char buf[20] = ""; if (fd->type == V4L2_MBUS_FRAME_DESC_TYPE_CSI2) WARN_ON(snprintf(buf, sizeof(buf), ", vc %u, dt 0x%02x", entry->bus.csi2.vc, entry->bus.csi2.dt) >= sizeof(buf)); dev_dbg(sd->dev, "\tstream %u, code 0x%04x, length %u, flags 0x%04x%s\n", entry->stream, entry->pixelcode, entry->length, entry->flags, buf); } return 0; } static inline int check_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid) { if (!edid) return -EINVAL; if (edid->blocks && edid->edid == NULL) return -EINVAL; return check_pad(sd, edid->pad); } static int call_get_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid) { return check_edid(sd, edid) ? : sd->ops->pad->get_edid(sd, edid); } static int call_set_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid) { return check_edid(sd, edid) ? : sd->ops->pad->set_edid(sd, edid); } static int call_s_dv_timings(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_dv_timings *timings) { if (!timings) return -EINVAL; return check_pad(sd, pad) ? : sd->ops->pad->s_dv_timings(sd, pad, timings); } static int call_g_dv_timings(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_dv_timings *timings) { if (!timings) return -EINVAL; return check_pad(sd, pad) ? : sd->ops->pad->g_dv_timings(sd, pad, timings); } static int call_query_dv_timings(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_dv_timings *timings) { if (!timings) return -EINVAL; return check_pad(sd, pad) ? : sd->ops->pad->query_dv_timings(sd, pad, timings); } static int call_dv_timings_cap(struct v4l2_subdev *sd, struct v4l2_dv_timings_cap *cap) { if (!cap) return -EINVAL; return check_pad(sd, cap->pad) ? : sd->ops->pad->dv_timings_cap(sd, cap); } static int call_enum_dv_timings(struct v4l2_subdev *sd, struct v4l2_enum_dv_timings *dvt) { if (!dvt) return -EINVAL; return check_pad(sd, dvt->pad) ? : sd->ops->pad->enum_dv_timings(sd, dvt); } static int call_get_mbus_config(struct v4l2_subdev *sd, unsigned int pad, struct v4l2_mbus_config *config) { return check_pad(sd, pad) ? : sd->ops->pad->get_mbus_config(sd, pad, config); } static int call_s_stream(struct v4l2_subdev *sd, int enable) { int ret; /* * The .s_stream() operation must never be called to start or stop an * already started or stopped subdev. Catch offenders but don't return * an error yet to avoid regressions. */ if (WARN_ON(sd->s_stream_enabled == !!enable)) return 0; ret = sd->ops->video->s_stream(sd, enable); if (!enable && ret < 0) { dev_warn(sd->dev, "disabling streaming failed (%d)\n", ret); ret = 0; } if (!ret) { sd->s_stream_enabled = enable; if (enable) v4l2_subdev_enable_privacy_led(sd); else v4l2_subdev_disable_privacy_led(sd); } return ret; } #ifdef CONFIG_MEDIA_CONTROLLER /* * Create state-management wrapper for pad ops dealing with subdev state. The * wrapper handles the case where the caller does not provide the called * subdev's state. This should be removed when all the callers are fixed. 
*/ #define DEFINE_STATE_WRAPPER(f, arg_type) \ static int call_##f##_state(struct v4l2_subdev *sd, \ struct v4l2_subdev_state *_state, \ arg_type *arg) \ { \ struct v4l2_subdev_state *state = _state; \ int ret; \ if (!_state) \ state = v4l2_subdev_lock_and_get_active_state(sd); \ ret = call_##f(sd, state, arg); \ if (!_state && state) \ v4l2_subdev_unlock_state(state); \ return ret; \ } #else /* CONFIG_MEDIA_CONTROLLER */ #define DEFINE_STATE_WRAPPER(f, arg_type) \ static int call_##f##_state(struct v4l2_subdev *sd, \ struct v4l2_subdev_state *state, \ arg_type *arg) \ { \ return call_##f(sd, state, arg); \ } #endif /* CONFIG_MEDIA_CONTROLLER */ DEFINE_STATE_WRAPPER(get_fmt, struct v4l2_subdev_format); DEFINE_STATE_WRAPPER(set_fmt, struct v4l2_subdev_format); DEFINE_STATE_WRAPPER(enum_mbus_code, struct v4l2_subdev_mbus_code_enum); DEFINE_STATE_WRAPPER(enum_frame_size, struct v4l2_subdev_frame_size_enum); DEFINE_STATE_WRAPPER(enum_frame_interval, struct v4l2_subdev_frame_interval_enum); DEFINE_STATE_WRAPPER(get_selection, struct v4l2_subdev_selection); DEFINE_STATE_WRAPPER(set_selection, struct v4l2_subdev_selection); static const struct v4l2_subdev_pad_ops v4l2_subdev_call_pad_wrappers = { .get_fmt = call_get_fmt_state, .set_fmt = call_set_fmt_state, .enum_mbus_code = call_enum_mbus_code_state, .enum_frame_size = call_enum_frame_size_state, .enum_frame_interval = call_enum_frame_interval_state, .get_selection = call_get_selection_state, .set_selection = call_set_selection_state, .get_frame_interval = call_get_frame_interval, .set_frame_interval = call_set_frame_interval, .get_edid = call_get_edid, .set_edid = call_set_edid, .s_dv_timings = call_s_dv_timings, .g_dv_timings = call_g_dv_timings, .query_dv_timings = call_query_dv_timings, .dv_timings_cap = call_dv_timings_cap, .enum_dv_timings = call_enum_dv_timings, .get_frame_desc = call_get_frame_desc, .get_mbus_config = call_get_mbus_config, }; static const struct v4l2_subdev_video_ops v4l2_subdev_call_video_wrappers = { .s_stream = call_s_stream, }; const struct v4l2_subdev_ops v4l2_subdev_call_wrappers = { .pad = &v4l2_subdev_call_pad_wrappers, .video = &v4l2_subdev_call_video_wrappers, }; EXPORT_SYMBOL(v4l2_subdev_call_wrappers); #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static struct v4l2_subdev_state * subdev_ioctl_get_state(struct v4l2_subdev *sd, struct v4l2_subdev_fh *subdev_fh, unsigned int cmd, void *arg) { u32 which; switch (cmd) { default: return NULL; case VIDIOC_SUBDEV_G_FMT: case VIDIOC_SUBDEV_S_FMT: which = ((struct v4l2_subdev_format *)arg)->which; break; case VIDIOC_SUBDEV_G_CROP: case VIDIOC_SUBDEV_S_CROP: which = ((struct v4l2_subdev_crop *)arg)->which; break; case VIDIOC_SUBDEV_ENUM_MBUS_CODE: which = ((struct v4l2_subdev_mbus_code_enum *)arg)->which; break; case VIDIOC_SUBDEV_ENUM_FRAME_SIZE: which = ((struct v4l2_subdev_frame_size_enum *)arg)->which; break; case VIDIOC_SUBDEV_ENUM_FRAME_INTERVAL: which = ((struct v4l2_subdev_frame_interval_enum *)arg)->which; break; case VIDIOC_SUBDEV_G_SELECTION: case VIDIOC_SUBDEV_S_SELECTION: which = ((struct v4l2_subdev_selection *)arg)->which; break; case VIDIOC_SUBDEV_G_FRAME_INTERVAL: case VIDIOC_SUBDEV_S_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval *fi = arg; if (!(subdev_fh->client_caps & V4L2_SUBDEV_CLIENT_CAP_INTERVAL_USES_WHICH)) fi->which = V4L2_SUBDEV_FORMAT_ACTIVE; which = fi->which; break; } case VIDIOC_SUBDEV_G_ROUTING: case VIDIOC_SUBDEV_S_ROUTING: which = ((struct v4l2_subdev_routing *)arg)->which; break; } return which == V4L2_SUBDEV_FORMAT_TRY ? 
subdev_fh->state : v4l2_subdev_get_unlocked_active_state(sd); } static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg, struct v4l2_subdev_state *state) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *vfh = file->private_data; struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); bool ro_subdev = test_bit(V4L2_FL_SUBDEV_RO_DEVNODE, &vdev->flags); bool streams_subdev = sd->flags & V4L2_SUBDEV_FL_STREAMS; bool client_supports_streams = subdev_fh->client_caps & V4L2_SUBDEV_CLIENT_CAP_STREAMS; int rval; /* * If the streams API is not enabled, remove V4L2_SUBDEV_CAP_STREAMS. * Remove this when the API is no longer experimental. */ if (!v4l2_subdev_enable_streams_api) streams_subdev = false; switch (cmd) { case VIDIOC_SUBDEV_QUERYCAP: { struct v4l2_subdev_capability *cap = arg; memset(cap->reserved, 0, sizeof(cap->reserved)); cap->version = LINUX_VERSION_CODE; cap->capabilities = (ro_subdev ? V4L2_SUBDEV_CAP_RO_SUBDEV : 0) | (streams_subdev ? V4L2_SUBDEV_CAP_STREAMS : 0); return 0; } case VIDIOC_QUERYCTRL: /* * TODO: this really should be folded into v4l2_queryctrl (this * currently returns -EINVAL for NULL control handlers). * However, v4l2_queryctrl() is still called directly by * drivers as well and until that has been addressed I believe * it is safer to do the check here. The same is true for the * other control ioctls below. */ if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_queryctrl(vfh->ctrl_handler, arg); case VIDIOC_QUERY_EXT_CTRL: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_query_ext_ctrl(vfh->ctrl_handler, arg); case VIDIOC_QUERYMENU: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_querymenu(vfh->ctrl_handler, arg); case VIDIOC_G_CTRL: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_g_ctrl(vfh->ctrl_handler, arg); case VIDIOC_S_CTRL: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_s_ctrl(vfh, vfh->ctrl_handler, arg); case VIDIOC_G_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_g_ext_ctrls(vfh->ctrl_handler, vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_S_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_s_ext_ctrls(vfh, vfh->ctrl_handler, vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_TRY_EXT_CTRLS: if (!vfh->ctrl_handler) return -ENOTTY; return v4l2_try_ext_ctrls(vfh->ctrl_handler, vdev, sd->v4l2_dev->mdev, arg); case VIDIOC_DQEVENT: if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) return -ENOIOCTLCMD; return v4l2_event_dequeue(vfh, arg, file->f_flags & O_NONBLOCK); case VIDIOC_SUBSCRIBE_EVENT: if (v4l2_subdev_has_op(sd, core, subscribe_event)) return v4l2_subdev_call(sd, core, subscribe_event, vfh, arg); if ((sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS) && vfh->ctrl_handler) return v4l2_ctrl_subdev_subscribe_event(sd, vfh, arg); return -ENOIOCTLCMD; case VIDIOC_UNSUBSCRIBE_EVENT: if (v4l2_subdev_has_op(sd, core, unsubscribe_event)) return v4l2_subdev_call(sd, core, unsubscribe_event, vfh, arg); if (sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS) return v4l2_event_subdev_unsubscribe(sd, vfh, arg); return -ENOIOCTLCMD; #ifdef CONFIG_VIDEO_ADV_DEBUG case VIDIOC_DBG_G_REGISTER: { struct v4l2_dbg_register *p = arg; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return v4l2_subdev_call(sd, core, g_register, p); } case VIDIOC_DBG_S_REGISTER: { struct v4l2_dbg_register *p = arg; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return v4l2_subdev_call(sd, core, s_register, p); } case VIDIOC_DBG_G_CHIP_INFO: { struct v4l2_dbg_chip_info *p = arg; if (p->match.type != 
V4L2_CHIP_MATCH_SUBDEV || p->match.addr) return -EINVAL; if (sd->ops->core && sd->ops->core->s_register) p->flags |= V4L2_CHIP_FL_WRITABLE; if (sd->ops->core && sd->ops->core->g_register) p->flags |= V4L2_CHIP_FL_READABLE; strscpy(p->name, sd->name, sizeof(p->name)); return 0; } #endif case VIDIOC_LOG_STATUS: { int ret; pr_info("%s: ================= START STATUS =================\n", sd->name); ret = v4l2_subdev_call(sd, core, log_status); pr_info("%s: ================== END STATUS ==================\n", sd->name); return ret; } case VIDIOC_SUBDEV_G_FMT: { struct v4l2_subdev_format *format = arg; if (!client_supports_streams) format->stream = 0; memset(format->reserved, 0, sizeof(format->reserved)); memset(format->format.reserved, 0, sizeof(format->format.reserved)); return v4l2_subdev_call(sd, pad, get_fmt, state, format); } case VIDIOC_SUBDEV_S_FMT: { struct v4l2_subdev_format *format = arg; if (format->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (!client_supports_streams) format->stream = 0; memset(format->reserved, 0, sizeof(format->reserved)); memset(format->format.reserved, 0, sizeof(format->format.reserved)); return v4l2_subdev_call(sd, pad, set_fmt, state, format); } case VIDIOC_SUBDEV_G_CROP: { struct v4l2_subdev_crop *crop = arg; struct v4l2_subdev_selection sel; if (!client_supports_streams) crop->stream = 0; memset(crop->reserved, 0, sizeof(crop->reserved)); memset(&sel, 0, sizeof(sel)); sel.which = crop->which; sel.pad = crop->pad; sel.stream = crop->stream; sel.target = V4L2_SEL_TGT_CROP; rval = v4l2_subdev_call( sd, pad, get_selection, state, &sel); crop->rect = sel.r; return rval; } case VIDIOC_SUBDEV_S_CROP: { struct v4l2_subdev_crop *crop = arg; struct v4l2_subdev_selection sel; if (crop->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (!client_supports_streams) crop->stream = 0; memset(crop->reserved, 0, sizeof(crop->reserved)); memset(&sel, 0, sizeof(sel)); sel.which = crop->which; sel.pad = crop->pad; sel.stream = crop->stream; sel.target = V4L2_SEL_TGT_CROP; sel.r = crop->rect; rval = v4l2_subdev_call( sd, pad, set_selection, state, &sel); crop->rect = sel.r; return rval; } case VIDIOC_SUBDEV_ENUM_MBUS_CODE: { struct v4l2_subdev_mbus_code_enum *code = arg; if (!client_supports_streams) code->stream = 0; memset(code->reserved, 0, sizeof(code->reserved)); return v4l2_subdev_call(sd, pad, enum_mbus_code, state, code); } case VIDIOC_SUBDEV_ENUM_FRAME_SIZE: { struct v4l2_subdev_frame_size_enum *fse = arg; if (!client_supports_streams) fse->stream = 0; memset(fse->reserved, 0, sizeof(fse->reserved)); return v4l2_subdev_call(sd, pad, enum_frame_size, state, fse); } case VIDIOC_SUBDEV_G_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval *fi = arg; if (!client_supports_streams) fi->stream = 0; memset(fi->reserved, 0, sizeof(fi->reserved)); return v4l2_subdev_call(sd, pad, get_frame_interval, state, fi); } case VIDIOC_SUBDEV_S_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval *fi = arg; if (!client_supports_streams) fi->stream = 0; if (fi->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; memset(fi->reserved, 0, sizeof(fi->reserved)); return v4l2_subdev_call(sd, pad, set_frame_interval, state, fi); } case VIDIOC_SUBDEV_ENUM_FRAME_INTERVAL: { struct v4l2_subdev_frame_interval_enum *fie = arg; if (!client_supports_streams) fie->stream = 0; memset(fie->reserved, 0, sizeof(fie->reserved)); return v4l2_subdev_call(sd, pad, enum_frame_interval, state, fie); } case VIDIOC_SUBDEV_G_SELECTION: { struct v4l2_subdev_selection *sel = arg; 
if (!client_supports_streams) sel->stream = 0; memset(sel->reserved, 0, sizeof(sel->reserved)); return v4l2_subdev_call( sd, pad, get_selection, state, sel); } case VIDIOC_SUBDEV_S_SELECTION: { struct v4l2_subdev_selection *sel = arg; if (sel->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (!client_supports_streams) sel->stream = 0; memset(sel->reserved, 0, sizeof(sel->reserved)); return v4l2_subdev_call( sd, pad, set_selection, state, sel); } case VIDIOC_G_EDID: { struct v4l2_subdev_edid *edid = arg; return v4l2_subdev_call(sd, pad, get_edid, edid); } case VIDIOC_S_EDID: { struct v4l2_subdev_edid *edid = arg; return v4l2_subdev_call(sd, pad, set_edid, edid); } case VIDIOC_SUBDEV_DV_TIMINGS_CAP: { struct v4l2_dv_timings_cap *cap = arg; return v4l2_subdev_call(sd, pad, dv_timings_cap, cap); } case VIDIOC_SUBDEV_ENUM_DV_TIMINGS: { struct v4l2_enum_dv_timings *dvt = arg; return v4l2_subdev_call(sd, pad, enum_dv_timings, dvt); } case VIDIOC_SUBDEV_QUERY_DV_TIMINGS: return v4l2_subdev_call(sd, pad, query_dv_timings, 0, arg); case VIDIOC_SUBDEV_G_DV_TIMINGS: return v4l2_subdev_call(sd, pad, g_dv_timings, 0, arg); case VIDIOC_SUBDEV_S_DV_TIMINGS: if (ro_subdev) return -EPERM; return v4l2_subdev_call(sd, pad, s_dv_timings, 0, arg); case VIDIOC_SUBDEV_G_STD: return v4l2_subdev_call(sd, video, g_std, arg); case VIDIOC_SUBDEV_S_STD: { v4l2_std_id *std = arg; if (ro_subdev) return -EPERM; return v4l2_subdev_call(sd, video, s_std, *std); } case VIDIOC_SUBDEV_ENUMSTD: { struct v4l2_standard *p = arg; v4l2_std_id id; if (v4l2_subdev_call(sd, video, g_tvnorms, &id)) return -EINVAL; return v4l_video_std_enumstd(p, id); } case VIDIOC_SUBDEV_QUERYSTD: return v4l2_subdev_call(sd, video, querystd, arg); case VIDIOC_SUBDEV_G_ROUTING: { struct v4l2_subdev_routing *routing = arg; struct v4l2_subdev_krouting *krouting; if (!v4l2_subdev_enable_streams_api) return -ENOIOCTLCMD; if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) return -ENOIOCTLCMD; memset(routing->reserved, 0, sizeof(routing->reserved)); krouting = &state->routing; memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes, krouting->routes, min(krouting->num_routes, routing->len_routes) * sizeof(*krouting->routes)); routing->num_routes = krouting->num_routes; return 0; } case VIDIOC_SUBDEV_S_ROUTING: { struct v4l2_subdev_routing *routing = arg; struct v4l2_subdev_route *routes = (struct v4l2_subdev_route *)(uintptr_t)routing->routes; struct v4l2_subdev_krouting krouting = {}; unsigned int i; if (!v4l2_subdev_enable_streams_api) return -ENOIOCTLCMD; if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) return -ENOIOCTLCMD; if (routing->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev) return -EPERM; if (routing->num_routes > routing->len_routes) return -EINVAL; memset(routing->reserved, 0, sizeof(routing->reserved)); for (i = 0; i < routing->num_routes; ++i) { const struct v4l2_subdev_route *route = &routes[i]; const struct media_pad *pads = sd->entity.pads; if (route->sink_stream > V4L2_SUBDEV_MAX_STREAM_ID || route->source_stream > V4L2_SUBDEV_MAX_STREAM_ID) return -EINVAL; if (route->sink_pad >= sd->entity.num_pads) return -EINVAL; if (!(pads[route->sink_pad].flags & MEDIA_PAD_FL_SINK)) return -EINVAL; if (route->source_pad >= sd->entity.num_pads) return -EINVAL; if (!(pads[route->source_pad].flags & MEDIA_PAD_FL_SOURCE)) return -EINVAL; } /* * If the driver doesn't support setting routing, just return * the routing table. 
*/ if (!v4l2_subdev_has_op(sd, pad, set_routing)) { memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes, state->routing.routes, min(state->routing.num_routes, routing->len_routes) * sizeof(*state->routing.routes)); routing->num_routes = state->routing.num_routes; return 0; } krouting.num_routes = routing->num_routes; krouting.len_routes = routing->len_routes; krouting.routes = routes; rval = v4l2_subdev_call(sd, pad, set_routing, state, routing->which, &krouting); if (rval < 0) return rval; memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes, state->routing.routes, min(state->routing.num_routes, routing->len_routes) * sizeof(*state->routing.routes)); routing->num_routes = state->routing.num_routes; return 0; } case VIDIOC_SUBDEV_G_CLIENT_CAP: { struct v4l2_subdev_client_capability *client_cap = arg; client_cap->capabilities = subdev_fh->client_caps; return 0; } case VIDIOC_SUBDEV_S_CLIENT_CAP: { struct v4l2_subdev_client_capability *client_cap = arg; /* * Clear V4L2_SUBDEV_CLIENT_CAP_STREAMS if streams API is not * enabled. Remove this when streams API is no longer * experimental. */ if (!v4l2_subdev_enable_streams_api) client_cap->capabilities &= ~V4L2_SUBDEV_CLIENT_CAP_STREAMS; /* Filter out unsupported capabilities */ client_cap->capabilities &= (V4L2_SUBDEV_CLIENT_CAP_STREAMS | V4L2_SUBDEV_CLIENT_CAP_INTERVAL_USES_WHICH); subdev_fh->client_caps = client_cap->capabilities; return 0; } default: return v4l2_subdev_call(sd, core, ioctl, cmd, arg); } return 0; } static long subdev_do_ioctl_lock(struct file *file, unsigned int cmd, void *arg) { struct video_device *vdev = video_devdata(file); struct mutex *lock = vdev->lock; long ret = -ENODEV; if (lock && mutex_lock_interruptible(lock)) return -ERESTARTSYS; if (video_is_registered(vdev)) { struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *vfh = file->private_data; struct v4l2_subdev_fh *subdev_fh = to_v4l2_subdev_fh(vfh); struct v4l2_subdev_state *state; state = subdev_ioctl_get_state(sd, subdev_fh, cmd, arg); if (state) v4l2_subdev_lock_state(state); ret = subdev_do_ioctl(file, cmd, arg, state); if (state) v4l2_subdev_unlock_state(state); } if (lock) mutex_unlock(lock); return ret; } static long subdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return video_usercopy(file, cmd, arg, subdev_do_ioctl_lock); } #ifdef CONFIG_COMPAT static long subdev_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); return v4l2_subdev_call(sd, core, compat_ioctl32, cmd, arg); } #endif #else /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static long subdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -ENODEV; } #ifdef CONFIG_COMPAT static long subdev_compat_ioctl32(struct file *file, unsigned int cmd, unsigned long arg) { return -ENODEV; } #endif #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static __poll_t subdev_poll(struct file *file, poll_table *wait) { struct video_device *vdev = video_devdata(file); struct v4l2_subdev *sd = vdev_to_v4l2_subdev(vdev); struct v4l2_fh *fh = file->private_data; if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS)) return EPOLLERR; poll_wait(file, &fh->wait, wait); if (v4l2_event_pending(fh)) return EPOLLPRI; return 0; } const struct v4l2_file_operations v4l2_subdev_fops = { .owner = THIS_MODULE, .open = subdev_open, .unlocked_ioctl = subdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl32 = subdev_compat_ioctl32, #endif .release = subdev_close, .poll 
= subdev_poll, }; #ifdef CONFIG_MEDIA_CONTROLLER int v4l2_subdev_get_fwnode_pad_1_to_1(struct media_entity *entity, struct fwnode_endpoint *endpoint) { struct fwnode_handle *fwnode; struct v4l2_subdev *sd; if (!is_media_entity_v4l2_subdev(entity)) return -EINVAL; sd = media_entity_to_v4l2_subdev(entity); fwnode = fwnode_graph_get_port_parent(endpoint->local_fwnode); fwnode_handle_put(fwnode); if (device_match_fwnode(sd->dev, fwnode)) return endpoint->port; return -ENXIO; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_fwnode_pad_1_to_1); int v4l2_subdev_link_validate_default(struct v4l2_subdev *sd, struct media_link *link, struct v4l2_subdev_format *source_fmt, struct v4l2_subdev_format *sink_fmt) { bool pass = true; /* The width, height and code must match. */ if (source_fmt->format.width != sink_fmt->format.width) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: width does not match (source %u, sink %u)\n", __func__, source_fmt->format.width, sink_fmt->format.width); pass = false; } if (source_fmt->format.height != sink_fmt->format.height) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: height does not match (source %u, sink %u)\n", __func__, source_fmt->format.height, sink_fmt->format.height); pass = false; } if (source_fmt->format.code != sink_fmt->format.code) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: media bus code does not match (source 0x%8.8x, sink 0x%8.8x)\n", __func__, source_fmt->format.code, sink_fmt->format.code); pass = false; } /* The field order must match, or the sink field order must be NONE * to support interlaced hardware connected to bridges that support * progressive formats only. */ if (source_fmt->format.field != sink_fmt->format.field && sink_fmt->format.field != V4L2_FIELD_NONE) { dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: field does not match (source %u, sink %u)\n", __func__, source_fmt->format.field, sink_fmt->format.field); pass = false; } if (pass) return 0; dev_dbg(sd->entity.graph_obj.mdev->dev, "%s: link was \"%s\":%u -> \"%s\":%u\n", __func__, link->source->entity->name, link->source->index, link->sink->entity->name, link->sink->index); return -EPIPE; } EXPORT_SYMBOL_GPL(v4l2_subdev_link_validate_default); static int v4l2_subdev_link_validate_get_format(struct media_pad *pad, u32 stream, struct v4l2_subdev_format *fmt, bool states_locked) { struct v4l2_subdev_state *state; struct v4l2_subdev *sd; int ret; sd = media_entity_to_v4l2_subdev(pad->entity); fmt->which = V4L2_SUBDEV_FORMAT_ACTIVE; fmt->pad = pad->index; fmt->stream = stream; if (states_locked) state = v4l2_subdev_get_locked_active_state(sd); else state = v4l2_subdev_lock_and_get_active_state(sd); ret = v4l2_subdev_call(sd, pad, get_fmt, state, fmt); if (!states_locked && state) v4l2_subdev_unlock_state(state); return ret; } #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static void __v4l2_link_validate_get_streams(struct media_pad *pad, u64 *streams_mask, bool states_locked) { struct v4l2_subdev_route *route; struct v4l2_subdev_state *state; struct v4l2_subdev *subdev; subdev = media_entity_to_v4l2_subdev(pad->entity); *streams_mask = 0; if (states_locked) state = v4l2_subdev_get_locked_active_state(subdev); else state = v4l2_subdev_lock_and_get_active_state(subdev); if (WARN_ON(!state)) return; for_each_active_route(&state->routing, route) { u32 route_pad; u32 route_stream; if (pad->flags & MEDIA_PAD_FL_SOURCE) { route_pad = route->source_pad; route_stream = route->source_stream; } else { route_pad = route->sink_pad; route_stream = route->sink_stream; } if (route_pad != pad->index) continue; *streams_mask 
|= BIT_ULL(route_stream); } if (!states_locked) v4l2_subdev_unlock_state(state); } #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ static void v4l2_link_validate_get_streams(struct media_pad *pad, u64 *streams_mask, bool states_locked) { struct v4l2_subdev *subdev = media_entity_to_v4l2_subdev(pad->entity); if (!(subdev->flags & V4L2_SUBDEV_FL_STREAMS)) { /* Non-streams subdevs have an implicit stream 0 */ *streams_mask = BIT_ULL(0); return; } #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) __v4l2_link_validate_get_streams(pad, streams_mask, states_locked); #else /* This shouldn't happen */ *streams_mask = 0; #endif } static int v4l2_subdev_link_validate_locked(struct media_link *link, bool states_locked) { struct v4l2_subdev *sink_subdev = media_entity_to_v4l2_subdev(link->sink->entity); struct device *dev = sink_subdev->entity.graph_obj.mdev->dev; u64 source_streams_mask; u64 sink_streams_mask; u64 dangling_sink_streams; u32 stream; int ret; dev_dbg(dev, "validating link \"%s\":%u -> \"%s\":%u\n", link->source->entity->name, link->source->index, link->sink->entity->name, link->sink->index); v4l2_link_validate_get_streams(link->source, &source_streams_mask, states_locked); v4l2_link_validate_get_streams(link->sink, &sink_streams_mask, states_locked); /* * It is ok to have more source streams than sink streams as extra * source streams can just be ignored by the receiver, but having extra * sink streams is an error as streams must have a source. */ dangling_sink_streams = (source_streams_mask ^ sink_streams_mask) & sink_streams_mask; if (dangling_sink_streams) { dev_err(dev, "Dangling sink streams: mask %#llx\n", dangling_sink_streams); return -EINVAL; } /* Validate source and sink stream formats */ for (stream = 0; stream < sizeof(sink_streams_mask) * 8; ++stream) { struct v4l2_subdev_format sink_fmt, source_fmt; if (!(sink_streams_mask & BIT_ULL(stream))) continue; dev_dbg(dev, "validating stream \"%s\":%u:%u -> \"%s\":%u:%u\n", link->source->entity->name, link->source->index, stream, link->sink->entity->name, link->sink->index, stream); ret = v4l2_subdev_link_validate_get_format(link->source, stream, &source_fmt, states_locked); if (ret < 0) { dev_dbg(dev, "Failed to get format for \"%s\":%u:%u (but that's ok)\n", link->source->entity->name, link->source->index, stream); continue; } ret = v4l2_subdev_link_validate_get_format(link->sink, stream, &sink_fmt, states_locked); if (ret < 0) { dev_dbg(dev, "Failed to get format for \"%s\":%u:%u (but that's ok)\n", link->sink->entity->name, link->sink->index, stream); continue; } /* TODO: add stream number to link_validate() */ ret = v4l2_subdev_call(sink_subdev, pad, link_validate, link, &source_fmt, &sink_fmt); if (!ret) continue; if (ret != -ENOIOCTLCMD) return ret; ret = v4l2_subdev_link_validate_default(sink_subdev, link, &source_fmt, &sink_fmt); if (ret) return ret; } return 0; } int v4l2_subdev_link_validate(struct media_link *link) { struct v4l2_subdev *source_sd, *sink_sd; struct v4l2_subdev_state *source_state, *sink_state; bool states_locked; int ret; /* * Links are validated in the context of the sink entity. Usage of this * helper on a sink that is not a subdev is a clear driver bug. */ if (WARN_ON_ONCE(!is_media_entity_v4l2_subdev(link->sink->entity))) return -EINVAL; /* * If the source is a video device, delegate link validation to it. This * allows usage of this helper for subdev connected to a video output * device, provided that the driver implement the video output device's * .link_validate() operation. 
*/ if (is_media_entity_v4l2_video_device(link->source->entity)) { struct media_entity *source = link->source->entity; if (!source->ops || !source->ops->link_validate) { /* * Many existing drivers do not implement the required * .link_validate() operation for their video devices. * Print a warning to get the drivers fixed, and return * 0 to avoid breaking userspace. This should * eventually be turned into a WARN_ON() when all * drivers will have been fixed. */ pr_warn_once("video device '%s' does not implement .link_validate(), driver bug!\n", source->name); return 0; } /* * Avoid infinite loops in case a video device incorrectly uses * this helper function as its .link_validate() handler. */ if (WARN_ON(source->ops->link_validate == v4l2_subdev_link_validate)) return -EINVAL; return source->ops->link_validate(link); } /* * If the source is still not a subdev, usage of this helper is a clear * driver bug. */ if (WARN_ON(!is_media_entity_v4l2_subdev(link->source->entity))) return -EINVAL; sink_sd = media_entity_to_v4l2_subdev(link->sink->entity); source_sd = media_entity_to_v4l2_subdev(link->source->entity); sink_state = v4l2_subdev_get_unlocked_active_state(sink_sd); source_state = v4l2_subdev_get_unlocked_active_state(source_sd); states_locked = sink_state && source_state; if (states_locked) v4l2_subdev_lock_states(sink_state, source_state); ret = v4l2_subdev_link_validate_locked(link, states_locked); if (states_locked) v4l2_subdev_unlock_states(sink_state, source_state); return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_link_validate); bool v4l2_subdev_has_pad_interdep(struct media_entity *entity, unsigned int pad0, unsigned int pad1) { struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity); struct v4l2_subdev_krouting *routing; struct v4l2_subdev_state *state; unsigned int i; state = v4l2_subdev_lock_and_get_active_state(sd); routing = &state->routing; for (i = 0; i < routing->num_routes; ++i) { struct v4l2_subdev_route *route = &routing->routes[i]; if (!(route->flags & V4L2_SUBDEV_ROUTE_FL_ACTIVE)) continue; if ((route->sink_pad == pad0 && route->source_pad == pad1) || (route->source_pad == pad0 && route->sink_pad == pad1)) { v4l2_subdev_unlock_state(state); return true; } } v4l2_subdev_unlock_state(state); return false; } EXPORT_SYMBOL_GPL(v4l2_subdev_has_pad_interdep); struct v4l2_subdev_state * __v4l2_subdev_state_alloc(struct v4l2_subdev *sd, const char *lock_name, struct lock_class_key *lock_key) { struct v4l2_subdev_state *state; int ret; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return ERR_PTR(-ENOMEM); __mutex_init(&state->_lock, lock_name, lock_key); if (sd->state_lock) state->lock = sd->state_lock; else state->lock = &state->_lock; state->sd = sd; /* Drivers that support streams do not need the legacy pad config */ if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS) && sd->entity.num_pads) { state->pads = kvcalloc(sd->entity.num_pads, sizeof(*state->pads), GFP_KERNEL); if (!state->pads) { ret = -ENOMEM; goto err; } } if (sd->internal_ops && sd->internal_ops->init_state) { /* * There can be no race at this point, but we lock the state * anyway to satisfy lockdep checks. 
*/ v4l2_subdev_lock_state(state); ret = sd->internal_ops->init_state(sd, state); v4l2_subdev_unlock_state(state); if (ret) goto err; } return state; err: if (state && state->pads) kvfree(state->pads); kfree(state); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_alloc); void __v4l2_subdev_state_free(struct v4l2_subdev_state *state) { if (!state) return; mutex_destroy(&state->_lock); kfree(state->routing.routes); kvfree(state->stream_configs.configs); kvfree(state->pads); kfree(state); } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_free); int __v4l2_subdev_init_finalize(struct v4l2_subdev *sd, const char *name, struct lock_class_key *key) { struct v4l2_subdev_state *state; struct device *dev = sd->dev; bool has_disable_streams; bool has_enable_streams; bool has_s_stream; /* Check that the subdevice implements the required features */ has_s_stream = v4l2_subdev_has_op(sd, video, s_stream); has_enable_streams = v4l2_subdev_has_op(sd, pad, enable_streams); has_disable_streams = v4l2_subdev_has_op(sd, pad, disable_streams); if (has_enable_streams != has_disable_streams) { dev_err(dev, "subdev '%s' must implement both or neither of .enable_streams() and .disable_streams()\n", sd->name); return -EINVAL; } if (sd->flags & V4L2_SUBDEV_FL_STREAMS) { if (has_s_stream && !has_enable_streams) { dev_err(dev, "subdev '%s' must implement .enable/disable_streams()\n", sd->name); return -EINVAL; } } if (sd->ctrl_handler) sd->flags |= V4L2_SUBDEV_FL_HAS_EVENTS; state = __v4l2_subdev_state_alloc(sd, name, key); if (IS_ERR(state)) return PTR_ERR(state); sd->active_state = state; return 0; } EXPORT_SYMBOL_GPL(__v4l2_subdev_init_finalize); void v4l2_subdev_cleanup(struct v4l2_subdev *sd) { struct v4l2_async_subdev_endpoint *ase, *ase_tmp; __v4l2_subdev_state_free(sd->active_state); sd->active_state = NULL; /* Uninitialised sub-device, bail out here. 
*/ if (!sd->async_subdev_endpoint_list.next) return; list_for_each_entry_safe(ase, ase_tmp, &sd->async_subdev_endpoint_list, async_subdev_endpoint_entry) { list_del(&ase->async_subdev_endpoint_entry); kfree(ase); } } EXPORT_SYMBOL_GPL(v4l2_subdev_cleanup); struct v4l2_mbus_framefmt * __v4l2_subdev_state_get_format(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON_ONCE(!state)) return NULL; if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].format; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].fmt; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_format); struct v4l2_rect * __v4l2_subdev_state_get_crop(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON_ONCE(!state)) return NULL; if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].crop; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].crop; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_crop); struct v4l2_rect * __v4l2_subdev_state_get_compose(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON_ONCE(!state)) return NULL; if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].compose; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].compose; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_compose); struct v4l2_fract * __v4l2_subdev_state_get_interval(struct v4l2_subdev_state *state, unsigned int pad, u32 stream) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; if (WARN_ON(!state)) return NULL; lockdep_assert_held(state->lock); if (state->pads) { if (stream) return NULL; if (pad >= state->sd->entity.num_pads) return NULL; return &state->pads[pad].interval; } lockdep_assert_held(state->lock); stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) { if (stream_configs->configs[i].pad == pad && stream_configs->configs[i].stream == stream) return &stream_configs->configs[i].interval; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_state_get_interval); #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) static int v4l2_subdev_init_stream_configs(struct v4l2_subdev_stream_configs *stream_configs, const struct v4l2_subdev_krouting *routing) { struct v4l2_subdev_stream_configs new_configs = { 0 }; struct v4l2_subdev_route *route; u32 idx; /* Count number of formats needed */ for_each_active_route(routing, route) { /* * Each route needs a format on both ends of the route. 
*/ new_configs.num_configs += 2; } if (new_configs.num_configs) { new_configs.configs = kvcalloc(new_configs.num_configs, sizeof(*new_configs.configs), GFP_KERNEL); if (!new_configs.configs) return -ENOMEM; } /* * Fill in the 'pad' and stream' value for each item in the array from * the routing table */ idx = 0; for_each_active_route(routing, route) { new_configs.configs[idx].pad = route->sink_pad; new_configs.configs[idx].stream = route->sink_stream; idx++; new_configs.configs[idx].pad = route->source_pad; new_configs.configs[idx].stream = route->source_stream; idx++; } kvfree(stream_configs->configs); *stream_configs = new_configs; return 0; } int v4l2_subdev_get_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_format *format) { struct v4l2_mbus_framefmt *fmt; fmt = v4l2_subdev_state_get_format(state, format->pad, format->stream); if (!fmt) return -EINVAL; format->format = *fmt; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_fmt); int v4l2_subdev_get_frame_interval(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, struct v4l2_subdev_frame_interval *fi) { struct v4l2_fract *interval; interval = v4l2_subdev_state_get_interval(state, fi->pad, fi->stream); if (!interval) return -EINVAL; fi->interval = *interval; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_get_frame_interval); int v4l2_subdev_set_routing(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, const struct v4l2_subdev_krouting *routing) { struct v4l2_subdev_krouting *dst = &state->routing; const struct v4l2_subdev_krouting *src = routing; struct v4l2_subdev_krouting new_routing = { 0 }; size_t bytes; int r; if (unlikely(check_mul_overflow((size_t)src->num_routes, sizeof(*src->routes), &bytes))) return -EOVERFLOW; lockdep_assert_held(state->lock); if (src->num_routes > 0) { new_routing.routes = kmemdup(src->routes, bytes, GFP_KERNEL); if (!new_routing.routes) return -ENOMEM; } new_routing.num_routes = src->num_routes; r = v4l2_subdev_init_stream_configs(&state->stream_configs, &new_routing); if (r) { kfree(new_routing.routes); return r; } kfree(dst->routes); *dst = new_routing; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_set_routing); struct v4l2_subdev_route * __v4l2_subdev_next_active_route(const struct v4l2_subdev_krouting *routing, struct v4l2_subdev_route *route) { if (route) ++route; else route = &routing->routes[0]; for (; route < routing->routes + routing->num_routes; ++route) { if (!(route->flags & V4L2_SUBDEV_ROUTE_FL_ACTIVE)) continue; return route; } return NULL; } EXPORT_SYMBOL_GPL(__v4l2_subdev_next_active_route); int v4l2_subdev_set_routing_with_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, const struct v4l2_subdev_krouting *routing, const struct v4l2_mbus_framefmt *fmt) { struct v4l2_subdev_stream_configs *stream_configs; unsigned int i; int ret; ret = v4l2_subdev_set_routing(sd, state, routing); if (ret) return ret; stream_configs = &state->stream_configs; for (i = 0; i < stream_configs->num_configs; ++i) stream_configs->configs[i].fmt = *fmt; return 0; } EXPORT_SYMBOL_GPL(v4l2_subdev_set_routing_with_fmt); int v4l2_subdev_routing_find_opposite_end(const struct v4l2_subdev_krouting *routing, u32 pad, u32 stream, u32 *other_pad, u32 *other_stream) { unsigned int i; for (i = 0; i < routing->num_routes; ++i) { struct v4l2_subdev_route *route = &routing->routes[i]; if (route->source_pad == pad && route->source_stream == stream) { if (other_pad) *other_pad = route->sink_pad; if (other_stream) *other_stream = route->sink_stream; return 0; } if (route->sink_pad == pad 
&& route->sink_stream == stream) { if (other_pad) *other_pad = route->source_pad; if (other_stream) *other_stream = route->source_stream; return 0; } } return -EINVAL; } EXPORT_SYMBOL_GPL(v4l2_subdev_routing_find_opposite_end); struct v4l2_mbus_framefmt * v4l2_subdev_state_get_opposite_stream_format(struct v4l2_subdev_state *state, u32 pad, u32 stream) { u32 other_pad, other_stream; int ret; ret = v4l2_subdev_routing_find_opposite_end(&state->routing, pad, stream, &other_pad, &other_stream); if (ret) return NULL; return v4l2_subdev_state_get_format(state, other_pad, other_stream); } EXPORT_SYMBOL_GPL(v4l2_subdev_state_get_opposite_stream_format); u64 v4l2_subdev_state_xlate_streams(const struct v4l2_subdev_state *state, u32 pad0, u32 pad1, u64 *streams) { const struct v4l2_subdev_krouting *routing = &state->routing; struct v4l2_subdev_route *route; u64 streams0 = 0; u64 streams1 = 0; for_each_active_route(routing, route) { if (route->sink_pad == pad0 && route->source_pad == pad1 && (*streams & BIT_ULL(route->sink_stream))) { streams0 |= BIT_ULL(route->sink_stream); streams1 |= BIT_ULL(route->source_stream); } if (route->source_pad == pad0 && route->sink_pad == pad1 && (*streams & BIT_ULL(route->source_stream))) { streams0 |= BIT_ULL(route->source_stream); streams1 |= BIT_ULL(route->sink_stream); } } *streams = streams0; return streams1; } EXPORT_SYMBOL_GPL(v4l2_subdev_state_xlate_streams); int v4l2_subdev_routing_validate(struct v4l2_subdev *sd, const struct v4l2_subdev_krouting *routing, enum v4l2_subdev_routing_restriction disallow) { u32 *remote_pads = NULL; unsigned int i, j; int ret = -EINVAL; if (disallow & (V4L2_SUBDEV_ROUTING_NO_STREAM_MIX | V4L2_SUBDEV_ROUTING_NO_MULTIPLEXING)) { remote_pads = kcalloc(sd->entity.num_pads, sizeof(*remote_pads), GFP_KERNEL); if (!remote_pads) return -ENOMEM; for (i = 0; i < sd->entity.num_pads; ++i) remote_pads[i] = U32_MAX; } for (i = 0; i < routing->num_routes; ++i) { const struct v4l2_subdev_route *route = &routing->routes[i]; /* Validate the sink and source pad numbers. */ if (route->sink_pad >= sd->entity.num_pads || !(sd->entity.pads[route->sink_pad].flags & MEDIA_PAD_FL_SINK)) { dev_dbg(sd->dev, "route %u sink (%u) is not a sink pad\n", i, route->sink_pad); goto out; } if (route->source_pad >= sd->entity.num_pads || !(sd->entity.pads[route->source_pad].flags & MEDIA_PAD_FL_SOURCE)) { dev_dbg(sd->dev, "route %u source (%u) is not a source pad\n", i, route->source_pad); goto out; } /* * V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX: all streams from a * sink pad must be routed to a single source pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SINK_STREAM_MIX) { if (remote_pads[route->sink_pad] != U32_MAX && remote_pads[route->sink_pad] != route->source_pad) { dev_dbg(sd->dev, "route %u attempts to mix %s streams\n", i, "sink"); goto out; } } /* * V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX: all streams on a * source pad must originate from a single sink pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SOURCE_STREAM_MIX) { if (remote_pads[route->source_pad] != U32_MAX && remote_pads[route->source_pad] != route->sink_pad) { dev_dbg(sd->dev, "route %u attempts to mix %s streams\n", i, "source"); goto out; } } /* * V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING: Pads on the sink * side can not do stream multiplexing, i.e. there can be only * a single stream in a sink pad. 
*/ if (disallow & V4L2_SUBDEV_ROUTING_NO_SINK_MULTIPLEXING) { if (remote_pads[route->sink_pad] != U32_MAX) { dev_dbg(sd->dev, "route %u attempts to multiplex on %s pad %u\n", i, "sink", route->sink_pad); goto out; } } /* * V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING: Pads on the * source side can not do stream multiplexing, i.e. there can * be only a single stream in a source pad. */ if (disallow & V4L2_SUBDEV_ROUTING_NO_SOURCE_MULTIPLEXING) { if (remote_pads[route->source_pad] != U32_MAX) { dev_dbg(sd->dev, "route %u attempts to multiplex on %s pad %u\n", i, "source", route->source_pad); goto out; } } if (remote_pads) { remote_pads[route->sink_pad] = route->source_pad; remote_pads[route->source_pad] = route->sink_pad; } for (j = i + 1; j < routing->num_routes; ++j) { const struct v4l2_subdev_route *r = &routing->routes[j]; /* * V4L2_SUBDEV_ROUTING_NO_1_TO_N: No two routes can * originate from the same (sink) stream. */ if ((disallow & V4L2_SUBDEV_ROUTING_NO_1_TO_N) && route->sink_pad == r->sink_pad && route->sink_stream == r->sink_stream) { dev_dbg(sd->dev, "routes %u and %u originate from same sink (%u/%u)\n", i, j, route->sink_pad, route->sink_stream); goto out; } /* * V4L2_SUBDEV_ROUTING_NO_N_TO_1: No two routes can end * at the same (source) stream. */ if ((disallow & V4L2_SUBDEV_ROUTING_NO_N_TO_1) && route->source_pad == r->source_pad && route->source_stream == r->source_stream) { dev_dbg(sd->dev, "routes %u and %u end at same source (%u/%u)\n", i, j, route->source_pad, route->source_stream); goto out; } } } ret = 0; out: kfree(remote_pads); return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_routing_validate); static void v4l2_subdev_collect_streams(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 pad, u64 streams_mask, u64 *found_streams, u64 *enabled_streams) { if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) { *found_streams = BIT_ULL(0); *enabled_streams = (sd->enabled_pads & BIT_ULL(pad)) ? BIT_ULL(0) : 0; return; } *found_streams = 0; *enabled_streams = 0; for (unsigned int i = 0; i < state->stream_configs.num_configs; ++i) { const struct v4l2_subdev_stream_config *cfg = &state->stream_configs.configs[i]; if (cfg->pad != pad || !(streams_mask & BIT_ULL(cfg->stream))) continue; *found_streams |= BIT_ULL(cfg->stream); if (cfg->enabled) *enabled_streams |= BIT_ULL(cfg->stream); } } static void v4l2_subdev_set_streams_enabled(struct v4l2_subdev *sd, struct v4l2_subdev_state *state, u32 pad, u64 streams_mask, bool enabled) { if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) { if (enabled) sd->enabled_pads |= BIT_ULL(pad); else sd->enabled_pads &= ~BIT_ULL(pad); return; } for (unsigned int i = 0; i < state->stream_configs.num_configs; ++i) { struct v4l2_subdev_stream_config *cfg = &state->stream_configs.configs[i]; if (cfg->pad == pad && (streams_mask & BIT_ULL(cfg->stream))) cfg->enabled = enabled; } } int v4l2_subdev_enable_streams(struct v4l2_subdev *sd, u32 pad, u64 streams_mask) { struct device *dev = sd->entity.graph_obj.mdev->dev; struct v4l2_subdev_state *state; bool already_streaming; u64 enabled_streams; u64 found_streams; bool use_s_stream; int ret; /* A few basic sanity checks first. */ if (pad >= sd->entity.num_pads) return -EINVAL; if (!(sd->entity.pads[pad].flags & MEDIA_PAD_FL_SOURCE)) return -EOPNOTSUPP; /* * We use a 64-bit bitmask for tracking enabled pads, so only subdevices * with 64 pads or less can be supported. 
*/ if (pad >= sizeof(sd->enabled_pads) * BITS_PER_BYTE) return -EOPNOTSUPP; if (!streams_mask) return 0; /* Fallback on .s_stream() if .enable_streams() isn't available. */ use_s_stream = !v4l2_subdev_has_op(sd, pad, enable_streams); if (!use_s_stream) state = v4l2_subdev_lock_and_get_active_state(sd); else state = NULL; /* * Verify that the requested streams exist and that they are not * already enabled. */ v4l2_subdev_collect_streams(sd, state, pad, streams_mask, &found_streams, &enabled_streams); if (found_streams != streams_mask) { dev_dbg(dev, "streams 0x%llx not found on %s:%u\n", streams_mask & ~found_streams, sd->entity.name, pad); ret = -EINVAL; goto done; } if (enabled_streams) { dev_dbg(dev, "streams 0x%llx already enabled on %s:%u\n", enabled_streams, sd->entity.name, pad); ret = -EALREADY; goto done; } dev_dbg(dev, "enable streams %u:%#llx\n", pad, streams_mask); already_streaming = v4l2_subdev_is_streaming(sd); if (!use_s_stream) { /* Call the .enable_streams() operation. */ ret = v4l2_subdev_call(sd, pad, enable_streams, state, pad, streams_mask); } else { /* Start streaming when the first pad is enabled. */ if (!already_streaming) ret = v4l2_subdev_call(sd, video, s_stream, 1); else ret = 0; } if (ret) { dev_dbg(dev, "enable streams %u:%#llx failed: %d\n", pad, streams_mask, ret); goto done; } /* Mark the streams as enabled. */ v4l2_subdev_set_streams_enabled(sd, state, pad, streams_mask, true); /* * TODO: When all the drivers have been changed to use * v4l2_subdev_enable_streams() and v4l2_subdev_disable_streams(), * instead of calling .s_stream() operation directly, we can remove * the privacy LED handling from call_s_stream() and do it here * for all cases. */ if (!use_s_stream && !already_streaming) v4l2_subdev_enable_privacy_led(sd); done: if (!use_s_stream) v4l2_subdev_unlock_state(state); return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_enable_streams); int v4l2_subdev_disable_streams(struct v4l2_subdev *sd, u32 pad, u64 streams_mask) { struct device *dev = sd->entity.graph_obj.mdev->dev; struct v4l2_subdev_state *state; u64 enabled_streams; u64 found_streams; bool use_s_stream; int ret; /* A few basic sanity checks first. */ if (pad >= sd->entity.num_pads) return -EINVAL; if (!(sd->entity.pads[pad].flags & MEDIA_PAD_FL_SOURCE)) return -EOPNOTSUPP; /* * We use a 64-bit bitmask for tracking enabled pads, so only subdevices * with 64 pads or less can be supported. */ if (pad >= sizeof(sd->enabled_pads) * BITS_PER_BYTE) return -EOPNOTSUPP; if (!streams_mask) return 0; /* Fallback on .s_stream() if .disable_streams() isn't available. */ use_s_stream = !v4l2_subdev_has_op(sd, pad, disable_streams); if (!use_s_stream) state = v4l2_subdev_lock_and_get_active_state(sd); else state = NULL; /* * Verify that the requested streams exist and that they are not * already disabled. */ v4l2_subdev_collect_streams(sd, state, pad, streams_mask, &found_streams, &enabled_streams); if (found_streams != streams_mask) { dev_dbg(dev, "streams 0x%llx not found on %s:%u\n", streams_mask & ~found_streams, sd->entity.name, pad); ret = -EINVAL; goto done; } if (enabled_streams != streams_mask) { dev_dbg(dev, "streams 0x%llx already disabled on %s:%u\n", streams_mask & ~enabled_streams, sd->entity.name, pad); ret = -EALREADY; goto done; } dev_dbg(dev, "disable streams %u:%#llx\n", pad, streams_mask); if (!use_s_stream) { /* Call the .disable_streams() operation. */ ret = v4l2_subdev_call(sd, pad, disable_streams, state, pad, streams_mask); } else { /* Stop streaming when the last streams are disabled. 
*/ if (!(sd->enabled_pads & ~BIT_ULL(pad))) ret = v4l2_subdev_call(sd, video, s_stream, 0); else ret = 0; } if (ret) { dev_dbg(dev, "disable streams %u:%#llx failed: %d\n", pad, streams_mask, ret); goto done; } v4l2_subdev_set_streams_enabled(sd, state, pad, streams_mask, false); done: if (!use_s_stream) { if (!v4l2_subdev_is_streaming(sd)) v4l2_subdev_disable_privacy_led(sd); v4l2_subdev_unlock_state(state); } return ret; } EXPORT_SYMBOL_GPL(v4l2_subdev_disable_streams); int v4l2_subdev_s_stream_helper(struct v4l2_subdev *sd, int enable) { struct v4l2_subdev_state *state; struct v4l2_subdev_route *route; struct media_pad *pad; u64 source_mask = 0; int pad_index = -1; /* * Find the source pad. This helper is meant for subdevs that have a * single source pad, so failures shouldn't happen, but catch them * loudly nonetheless as they indicate a driver bug. */ media_entity_for_each_pad(&sd->entity, pad) { if (pad->flags & MEDIA_PAD_FL_SOURCE) { pad_index = pad->index; break; } } if (WARN_ON(pad_index == -1)) return -EINVAL; if (sd->flags & V4L2_SUBDEV_FL_STREAMS) { /* * As there's a single source pad, just collect all the source * streams. */ state = v4l2_subdev_lock_and_get_active_state(sd); for_each_active_route(&state->routing, route) source_mask |= BIT_ULL(route->source_stream); v4l2_subdev_unlock_state(state); } else { /* * For non-streams subdevices, there's a single implicit stream * per pad. */ source_mask = BIT_ULL(0); } if (enable) return v4l2_subdev_enable_streams(sd, pad_index, source_mask); else return v4l2_subdev_disable_streams(sd, pad_index, source_mask); } EXPORT_SYMBOL_GPL(v4l2_subdev_s_stream_helper); #endif /* CONFIG_VIDEO_V4L2_SUBDEV_API */ #endif /* CONFIG_MEDIA_CONTROLLER */ void v4l2_subdev_init(struct v4l2_subdev *sd, const struct v4l2_subdev_ops *ops) { INIT_LIST_HEAD(&sd->list); BUG_ON(!ops); sd->ops = ops; sd->v4l2_dev = NULL; sd->flags = 0; sd->name[0] = '\0'; sd->grp_id = 0; sd->dev_priv = NULL; sd->host_priv = NULL; sd->privacy_led = NULL; INIT_LIST_HEAD(&sd->async_subdev_endpoint_list); #if defined(CONFIG_MEDIA_CONTROLLER) sd->entity.name = sd->name; sd->entity.obj_type = MEDIA_ENTITY_TYPE_V4L2_SUBDEV; sd->entity.function = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN; #endif } EXPORT_SYMBOL(v4l2_subdev_init); void v4l2_subdev_notify_event(struct v4l2_subdev *sd, const struct v4l2_event *ev) { v4l2_event_queue(sd->devnode, ev); v4l2_subdev_notify(sd, V4L2_DEVICE_NOTIFY_EVENT, (void *)ev); } EXPORT_SYMBOL_GPL(v4l2_subdev_notify_event); bool v4l2_subdev_is_streaming(struct v4l2_subdev *sd) { struct v4l2_subdev_state *state; if (!v4l2_subdev_has_op(sd, pad, enable_streams)) return sd->s_stream_enabled; if (!(sd->flags & V4L2_SUBDEV_FL_STREAMS)) return !!sd->enabled_pads; state = v4l2_subdev_get_locked_active_state(sd); for (unsigned int i = 0; i < state->stream_configs.num_configs; ++i) { const struct v4l2_subdev_stream_config *cfg; cfg = &state->stream_configs.configs[i]; if (cfg->enabled) return true; } return false; } EXPORT_SYMBOL_GPL(v4l2_subdev_is_streaming); int v4l2_subdev_get_privacy_led(struct v4l2_subdev *sd) { #if IS_REACHABLE(CONFIG_LEDS_CLASS) sd->privacy_led = led_get(sd->dev, "privacy-led"); if (IS_ERR(sd->privacy_led) && PTR_ERR(sd->privacy_led) != -ENOENT) return dev_err_probe(sd->dev, PTR_ERR(sd->privacy_led), "getting privacy LED\n"); if (!IS_ERR_OR_NULL(sd->privacy_led)) { mutex_lock(&sd->privacy_led->led_access); led_sysfs_disable(sd->privacy_led); led_trigger_remove(sd->privacy_led); led_set_brightness(sd->privacy_led, 0); 
		mutex_unlock(&sd->privacy_led->led_access);
	}
#endif
	return 0;
}
EXPORT_SYMBOL_GPL(v4l2_subdev_get_privacy_led);

void v4l2_subdev_put_privacy_led(struct v4l2_subdev *sd)
{
#if IS_REACHABLE(CONFIG_LEDS_CLASS)
	if (!IS_ERR_OR_NULL(sd->privacy_led)) {
		mutex_lock(&sd->privacy_led->led_access);
		led_sysfs_enable(sd->privacy_led);
		mutex_unlock(&sd->privacy_led->led_access);
		led_put(sd->privacy_led);
	}
#endif
}
EXPORT_SYMBOL_GPL(v4l2_subdev_put_privacy_led);
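For context on the pad/stream helpers defined above, the following is a minimal sketch of how a bridge driver might start and stop a connected source subdevice through v4l2_subdev_enable_streams() and v4l2_subdev_disable_streams(). The my_bridge structure, its fields, and the choice of stream 0 are hypothetical illustration only, not code taken from the file above.

#include <linux/bits.h>
#include <media/v4l2-subdev.h>

/* Hypothetical bridge driver state: a remote source subdev and its pad. */
struct my_bridge {
	struct v4l2_subdev *remote_sd;	/* subdev feeding this bridge */
	u32 remote_pad;			/* index of its source pad */
};

static int my_bridge_start(struct my_bridge *bridge)
{
	/*
	 * Enable stream 0 on the remote source pad. For subdevs without
	 * V4L2_SUBDEV_FL_STREAMS this maps to the implicit stream 0, and
	 * the helper above falls back to .s_stream() when the subdev does
	 * not implement .enable_streams().
	 */
	return v4l2_subdev_enable_streams(bridge->remote_sd,
					  bridge->remote_pad, BIT_ULL(0));
}

static void my_bridge_stop(struct my_bridge *bridge)
{
	/* Mirror of my_bridge_start(); the return value is ignored here. */
	v4l2_subdev_disable_streams(bridge->remote_sd, bridge->remote_pad,
				    BIT_ULL(0));
}

Because the helpers above track enabled pads and streams internally, enabling or disabling the same mask twice returns -EALREADY instead of reaching the driver a second time.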
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef DECOMPRESSOR_H
#define DECOMPRESSOR_H
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
 * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * decompressor.h
 */

#include <linux/bio.h>

struct squashfs_decompressor {
	void	*(*init)(struct squashfs_sb_info *, void *);
	void	*(*comp_opts)(struct squashfs_sb_info *, void *, int);
	void	(*free)(void *);
	int	(*decompress)(struct squashfs_sb_info *, void *,
		struct bio *, int, int, struct squashfs_page_actor *);
	int	id;
	char	*name;
	int	alloc_buffer;
	int	supported;
};

static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
					void *buff, int length)
{
	return msblk->decompressor->comp_opts ?
		msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
}

#ifdef CONFIG_SQUASHFS_XZ
extern const struct squashfs_decompressor squashfs_xz_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_LZ4
extern const struct squashfs_decompressor squashfs_lz4_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_LZO
extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_ZLIB
extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
#endif

#ifdef CONFIG_SQUASHFS_ZSTD
extern const struct squashfs_decompressor squashfs_zstd_comp_ops;
#endif

#endif
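To make the ops table above concrete, here is a hypothetical sketch of how a decompressor backend would fill in struct squashfs_decompressor. The no-op callbacks, the squashfs_noop_comp_ops symbol, and its id value are illustrative only; the real backends declared above (xz, lz4, lzo, zlib, zstd) live in their own source files and use the compression ids from squashfs_fs.h.

#include <linux/errno.h>
#include <linux/bio.h>

#include "squashfs_fs_sb.h"
#include "decompressor.h"
#include "page_actor.h"

/* This illustrative backend keeps no per-mount decompressor state. */
static void *noop_init(struct squashfs_sb_info *msblk, void *comp_opts)
{
	return NULL;
}

static void noop_free(void *strm)
{
}

/* A real backend would decode the bio contents into the page actor here. */
static int noop_decompress(struct squashfs_sb_info *msblk, void *strm,
			   struct bio *bio, int offset, int length,
			   struct squashfs_page_actor *output)
{
	return -EOPNOTSUPP;
}

const struct squashfs_decompressor squashfs_noop_comp_ops = {
	.init		= noop_init,
	.comp_opts	= NULL,	/* squashfs_comp_opts() then returns NULL */
	.free		= noop_free,
	.decompress	= noop_decompress,
	.id		= 0,	/* hypothetical, not a real SQUASHFS_*_COMPRESSION value */
	.name		= "noop",
	.alloc_buffer	= 0,
	.supported	= 1
};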
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
*/ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; void xfs_uuid_table_free(void) { if (xfs_uuid_table_size == 0) return; kfree(xfs_uuid_table); xfs_uuid_table = NULL; xfs_uuid_table_size = 0; } /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. */ STATIC int xfs_uuid_mount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; int hole, i; /* Publish UUID in struct super_block */ super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid)); if (xfs_has_nouuid(mp)) return 0; if (uuid_is_null(uuid)) { xfs_warn(mp, "Filesystem has null UUID - can't mount"); return -EINVAL; } mutex_lock(&xfs_uuid_table_mutex); for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { if (uuid_is_null(&xfs_uuid_table[i])) { hole = i; continue; } if (uuid_equal(uuid, &xfs_uuid_table[i])) goto out_duplicate; } if (hole < 0) { xfs_uuid_table = krealloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), GFP_KERNEL | __GFP_NOFAIL); hole = xfs_uuid_table_size++; } xfs_uuid_table[hole] = *uuid; mutex_unlock(&xfs_uuid_table_mutex); return 0; out_duplicate: mutex_unlock(&xfs_uuid_table_mutex); xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); return -EINVAL; } STATIC void xfs_uuid_unmount( struct xfs_mount *mp) { uuid_t *uuid = &mp->m_sb.sb_uuid; int i; if (xfs_has_nouuid(mp)) return; mutex_lock(&xfs_uuid_table_mutex); for (i = 0; i < xfs_uuid_table_size; i++) { if (uuid_is_null(&xfs_uuid_table[i])) continue; if (!uuid_equal(uuid, &xfs_uuid_table[i])) continue; memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); break; } ASSERT(i < xfs_uuid_table_size); mutex_unlock(&xfs_uuid_table_mutex); } /* * Check size of device based on the (data/realtime) block count. * Note: this check is used by the growfs code as well as mount. */ int xfs_sb_validate_fsb_count( xfs_sb_t *sbp, uint64_t nblocks) { uint64_t max_bytes; ASSERT(sbp->sb_blocklog >= BBSHIFT); if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes)) return -EFBIG; /* Limited by ULONG_MAX of page cache index */ if (max_bytes >> PAGE_SHIFT > ULONG_MAX) return -EFBIG; return 0; } /* * xfs_readsb * * Does the initial read of the superblock. */ int xfs_readsb( struct xfs_mount *mp, int flags) { unsigned int sector_size; struct xfs_buf *bp; struct xfs_sb *sbp = &mp->m_sb; int error; int loud = !(flags & XFS_MFSI_QUIET); const struct xfs_buf_ops *buf_ops; ASSERT(mp->m_sb_bp == NULL); ASSERT(mp->m_ddev_targp != NULL); /* * For the initial read, we must guess at the sector * size based on the block device. 
It's enough to * get the sb_sectsize out of the superblock and * then reread with the proper length. * We don't verify it yet, because it may not be complete. */ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); buf_ops = NULL; /* * Allocate a (locked) buffer to hold the superblock. This will be kept * around at all times to optimize access to the superblock. Therefore, * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), XBF_NO_IOACCT, &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } /* * Initialize the mount structure from the superblock. */ xfs_sb_from_disk(sbp, bp->b_addr); /* * If we haven't validated the superblock, do so now before we try * to check the sector size and reread the superblock appropriately. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { if (loud) xfs_warn(mp, "Invalid superblock magic number"); error = -EINVAL; goto release_buf; } /* * We must be able to do sector-sized and sector-aligned IO. */ if (sector_size > sbp->sb_sectsize) { if (loud) xfs_warn(mp, "device supports %u byte sectors (not %u)", sector_size, sbp->sb_sectsize); error = -ENOSYS; goto release_buf; } if (buf_ops == NULL) { /* * Re-read the superblock so the buffer is correctly sized, * and properly verified. */ xfs_buf_relse(bp); sector_size = sbp->sb_sectsize; buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; goto reread; } mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); /* * If logged xattrs are enabled after log recovery finishes, then set * the opstate so that log recovery will work properly. */ if (xfs_sb_version_haslogxattrs(&mp->m_sb)) xfs_set_using_logged_xattrs(mp); /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; release_buf: xfs_buf_relse(bp); return error; } /* * If the sunit/swidth change would move the precomputed root inode value, we * must reject the ondisk change because repair will stumble over that. * However, we allow the mount to proceed because we never rejected this * combination before. Returns true to update the sb, false otherwise. */ static inline int xfs_check_new_dalign( struct xfs_mount *mp, int new_dalign, bool *update_sb) { struct xfs_sb *sbp = &mp->m_sb; xfs_ino_t calc_ino; calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); if (sbp->sb_rootino == calc_ino) { *update_sb = true; return 0; } xfs_warn(mp, "Cannot change stripe alignment; would require moving root inode."); /* * XXX: Next time we add a new incompat feature, this should start * returning -EINVAL to fail the mount. Until then, spit out a warning * that we're ignoring the administrator's instructions. */ xfs_warn(mp, "Skipping superblock stripe alignment update."); *update_sb = false; return 0; } /* * If we were provided with new sunit/swidth values as mount options, make sure * that they pass basic alignment and superblock feature checks, and convert * them into the same units (FSB) that everything else expects. This step * /must/ be done before computing the inode geometry. */ STATIC int xfs_validate_new_dalign( struct xfs_mount *mp) { if (mp->m_dalign == 0) return 0; /* * If stripe unit and stripe width are not multiples * of the fs blocksize turn off alignment. 
*/ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { xfs_warn(mp, "alignment check failed: sunit/swidth vs. blocksize(%d)", mp->m_sb.sb_blocksize); return -EINVAL; } /* * Convert the stripe unit and width to FSBs. */ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, "alignment check failed: sunit/swidth vs. agsize(%d)", mp->m_sb.sb_agblocks); return -EINVAL; } if (!mp->m_dalign) { xfs_warn(mp, "alignment check failed: sunit(%d) less than bsize(%d)", mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); return -EINVAL; } return 0; } /* Update alignment values based on mount options and sb values. */ STATIC int xfs_update_alignment( struct xfs_mount *mp) { struct xfs_sb *sbp = &mp->m_sb; if (mp->m_dalign) { bool update_sb; int error; if (sbp->sb_unit == mp->m_dalign && sbp->sb_width == mp->m_swidth) return 0; error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); if (error || !update_sb) return error; sbp->sb_unit = mp->m_dalign; sbp->sb_width = mp->m_swidth; mp->m_update_sb = true; } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) { mp->m_dalign = sbp->sb_unit; mp->m_swidth = sbp->sb_width; } return 0; } /* * precalculate the low space thresholds for dynamic speculative preallocation. */ void xfs_set_low_space_thresholds( struct xfs_mount *mp) { uint64_t dblocks = mp->m_sb.sb_dblocks; uint64_t rtexts = mp->m_sb.sb_rextents; int i; do_div(dblocks, 100); do_div(rtexts, 100); for (i = 0; i < XFS_LOWSP_MAX; i++) { mp->m_low_space[i] = dblocks * (i + 1); mp->m_low_rtexts[i] = rtexts * (i + 1); } } /* * Check that the data (and log if separate) is an ok size. */ STATIC int xfs_check_sizes( struct xfs_mount *mp) { struct xfs_buf *bp; xfs_daddr_t d; int error; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { xfs_warn(mp, "filesystem size mismatch detected"); return -EFBIG; } error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); if (error) { xfs_warn(mp, "last sector read failed"); return error; } xfs_buf_relse(bp); if (mp->m_logdev_targp == mp->m_ddev_targp) return 0; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { xfs_warn(mp, "log size mismatch detected"); return -EFBIG; } error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); if (error) { xfs_warn(mp, "log device read failed"); return error; } xfs_buf_relse(bp); return 0; } /* * Clear the quotaflags in memory and in the superblock. */ int xfs_mount_reset_sbqflags( struct xfs_mount *mp) { mp->m_qflags = 0; /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */ if (mp->m_sb.sb_qflags == 0) return 0; spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = 0; spin_unlock(&mp->m_sb_lock); if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) return 0; return xfs_sync_sb(mp, false); } uint64_t xfs_default_resblks(xfs_mount_t *mp) { uint64_t resblks; /* * We default to 5% or 8192 fsbs of space reserved, whichever is * smaller. This is intended to cover concurrent allocation * transactions when we initially hit enospc. These each require a 4 * block reservation. Hence by default we cover roughly 2000 concurrent * allocation reservations. 
*/ resblks = mp->m_sb.sb_dblocks; do_div(resblks, 20); resblks = min_t(uint64_t, resblks, 8192); return resblks; } /* Ensure the summary counts are correct. */ STATIC int xfs_check_summary_counts( struct xfs_mount *mp) { int error = 0; /* * The AG0 superblock verifier rejects in-progress filesystems, * so we should never see the flag set this far into mounting. */ if (mp->m_sb.sb_inprogress) { xfs_err(mp, "sb_inprogress set after log recovery??"); WARN_ON(1); return -EFSCORRUPTED; } /* * Now the log is mounted, we know if it was an unclean shutdown or * not. If it was, with the first phase of recovery has completed, we * have consistent AG blocks on disk. We have not recovered EFIs yet, * but they are recovered transactionally in the second recovery phase * later. * * If the log was clean when we mounted, we can check the summary * counters. If any of them are obviously incorrect, we can recompute * them from the AGF headers in the next step. */ if (xfs_is_clean(mp) && (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || !xfs_verify_icount(mp, mp->m_sb.sb_icount) || mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); /* * We can safely re-initialise incore superblock counters from the * per-ag data. These may not be correct if the filesystem was not * cleanly unmounted, so we waited for recovery to finish before doing * this. * * If the filesystem was cleanly unmounted or the previous check did * not flag anything weird, then we can trust the values in the * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); if (error) return error; } /* * Older kernels misused sb_frextents to reflect both incore * reservations made by running transactions and the actual count of * free rt extents in the ondisk metadata. Transactions committed * during runtime can therefore contain a superblock update that * undercounts the number of free rt extents tracked in the rt bitmap. * A clean unmount record will have the correct frextents value since * there can be no other transactions running at that point. * * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; } return 0; } static void xfs_unmount_check( struct xfs_mount *mp) { if (xfs_is_shutdown(mp)) return; if (percpu_counter_sum(&mp->m_ifree) > percpu_counter_sum(&mp->m_icount)) { xfs_alert(mp, "ifree/icount mismatch at unmount"); xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); } } /* * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, * so we need to unpin them, write them back and/or reclaim them before unmount * can proceed. In other words, callers are required to have inactivated all * inodes. * * An inode cluster that has been freed can have its buffer still pinned in * memory because the transaction is still sitting in a iclog. The stale inodes * on that buffer will be pinned to the buffer until the transaction hits the * disk and the callbacks run. Pushing the AIL will skip the stale inodes and * may never see the pinned buffer, so nothing will push out the iclog and * unpin the buffer. 
* * Hence we need to force the log to unpin everything first. However, log * forces don't wait for the discards they issue to complete, so we have to * explicitly wait for them to complete here as well. * * Then we can tell the world we are unmounting so that error handling knows * that the filesystem is going away and we should error out anything that we * have been retrying in the background. This will prevent never-ending * retries in AIL pushing from hanging the unmount. * * Finally, we can push the AIL to clean all the remaining dirty objects, then * reclaim the remaining inodes that are still in memory at this point in time. */ static void xfs_unmount_flush_inodes( struct xfs_mount *mp) { xfs_log_force(mp, XFS_LOG_SYNC); xfs_extent_busy_wait_all(mp); flush_workqueue(xfs_discard_wq); xfs_set_unmounting(mp); xfs_ail_push_all_sync(mp->m_ail); xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); } static void xfs_mount_setup_inode_geom( struct xfs_mount *mp) { struct xfs_ino_geometry *igeo = M_IGEO(mp); igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp); ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp)); xfs_ialloc_setup_geometry(mp); } /* Mount the metadata directory tree root. */ STATIC int xfs_mount_setup_metadir( struct xfs_mount *mp) { int error; /* Load the metadata directory root inode into memory. */ error = xfs_metafile_iget(mp, mp->m_sb.sb_metadirino, XFS_METAFILE_DIR, &mp->m_metadirip); if (error) xfs_warn(mp, "Failed to load metadir root directory, error %d", error); return error; } /* Compute maximum possible height for per-AG btree types for this fs. */ static inline void xfs_agbtree_compute_maxlevels( struct xfs_mount *mp) { unsigned int levels; levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels); levels = max(levels, mp->m_rmap_maxlevels); mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } /* Compute maximum possible height for realtime btree types for this fs. */ static inline void xfs_rtbtree_compute_maxlevels( struct xfs_mount *mp) { mp->m_rtbtree_maxlevels = max(mp->m_rtrmap_maxlevels, mp->m_rtrefc_maxlevels); } /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct * - if we're a 32-bit kernel, do a size check on the superblock * so we don't mount terabyte filesystems * - init mount struct realtime fields * - allocate inode hash table for fs * - init directory manager * - perform recovery and init the log manager */ int xfs_mountfs( struct xfs_mount *mp) { struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; struct xfs_ino_geometry *igeo = M_IGEO(mp); uint quotamount = 0; uint quotaflags = 0; int error = 0; xfs_sb_mount_common(mp, sbp); /* * Check for a mismatched features2 values. Older kernels read & wrote * into the wrong sb offset for sb_features2 on some platforms due to * xfs_sb_t not being 64bit size aligned when sb_features2 was added, * which made older superblock reading/writing routines swap it as a * 64-bit value. * * For backwards compatibility, we make both slots equal. * * If we detect a mismatched field, we OR the set bits into the existing * features2 field in case it has already been modified; we don't want * to lose any features. We then update the bad location with the ORed * value so that older kernels will see any features2 flags. The * superblock writeback code ensures the new sb_features2 is copied to * sb_bad_features2 before it is logged or written to disk. 
*/ if (xfs_sb_has_mismatched_features2(sbp)) { xfs_warn(mp, "correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; mp->m_update_sb = true; } /* always use v2 inodes by default now */ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; mp->m_features |= XFS_FEAT_NLINK; mp->m_update_sb = true; } /* * If we were given new sunit/swidth options, do some basic validation * checks and convert the incore dalign and swidth values to the * same units (FSB) that everything else uses. This /must/ happen * before computing the inode geometry. */ error = xfs_validate_new_dalign(mp); if (error) goto out; xfs_alloc_compute_maxlevels(mp); xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_mount_setup_inode_geom(mp); xfs_rmapbt_compute_maxlevels(mp); xfs_rtrmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); xfs_rtrefcountbt_compute_maxlevels(mp); xfs_agbtree_compute_maxlevels(mp); xfs_rtbtree_compute_maxlevels(mp); /* * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks * is NOT aligned turn off m_dalign since allocator alignment is within * an ag, therefore ag has to be aligned at stripe boundary. Note that * we must compute the free space and rmap btree geometry before doing * this. */ error = xfs_update_alignment(mp); if (error) goto out; /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; super_set_sysfs_name_id(mp->m_super); error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_super->s_id); if (error) goto out; error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, &mp->m_kobj, "stats"); if (error) goto out_remove_sysfs; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); error = xfs_error_sysfs_init(mp); if (error) goto out_remove_scrub_stats; error = xfs_errortag_init(mp); if (error) goto out_remove_error_sysfs; error = xfs_uuid_mount(mp); if (error) goto out_remove_errortag; /* * Update the preferred write size based on the information from the * on-disk superblock. */ mp->m_allocsize_log = max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log); mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog); /* set the low space thresholds for dynamic preallocation */ xfs_set_low_space_thresholds(mp); /* * If enabled, sparse inode chunk alignment is expected to match the * cluster size. Full inode chunk alignment must match the chunk size, * but that is checked on sb read verification... */ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align != XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { xfs_warn(mp, "Sparse inode block alignment (%u) must match cluster size (%llu).", mp->m_sb.sb_spino_align, XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)); error = -EINVAL; goto out_remove_uuid; } /* * Check that the data (and log if separate) is an ok size. */ error = xfs_check_sizes(mp); if (error) goto out_remove_uuid; /* * Initialize realtime fields in the mount structure */ error = xfs_rtmount_init(mp); if (error) { xfs_warn(mp, "RT mount failed"); goto out_remove_uuid; } /* * Copies the low order bits of the timestamp and the randomly * set "sequence" number out of a UUID. 
*/ mp->m_fixedfsid[0] = (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) | get_unaligned_be16(&sbp->sb_uuid.b[4]); mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); error = xfs_da_mount(mp); if (error) { xfs_warn(mp, "Failed dir/attr init: %d", error); goto out_remove_uuid; } /* * Initialize the precomputed transaction reservations values. */ xfs_trans_init(mp); /* * Allocate and initialize the per-ag data. */ error = xfs_initialize_perag(mp, 0, sbp->sb_agcount, mp->m_sb.sb_dblocks, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; } error = xfs_initialize_rtgroups(mp, 0, sbp->sb_rgcount, mp->m_sb.sb_rextents); if (error) { xfs_warn(mp, "Failed rtgroup init: %d", error); goto out_free_perag; } if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { xfs_warn(mp, "no log defined"); error = -EFSCORRUPTED; goto out_free_rtgroup; } error = xfs_inodegc_register_shrinker(mp); if (error) goto out_fail_wait; /* * If we're resuming quota status, pick up the preliminary qflags from * the ondisk superblock so that we know if we should recover dquots. */ if (xfs_is_resuming_quotaon(mp)) xfs_qm_resume_quotaon(mp); /* * Log's mount-time initialization. The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or * cancelled. */ error = xfs_log_mount(mp, mp->m_logdev_targp, XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); goto out_inodegc_shrinker; } /* * If we're resuming quota status and recovered the log, re-sample the * qflags from the ondisk superblock now that we've recovered it, just * in case someone shut down enforcement just before a crash. */ if (xfs_clear_resuming_quotaon(mp) && xlog_recovery_needed(mp->m_log)) xfs_qm_resume_quotaon(mp); /* * If logged xattrs are still enabled after log recovery finishes, then * they'll be available until unmount. Otherwise, turn them off. */ if (xfs_sb_version_haslogxattrs(&mp->m_sb)) xfs_set_using_logged_xattrs(mp); else xfs_clear_using_logged_xattrs(mp); /* Enable background inode inactivation workers. */ xfs_inodegc_start(mp); xfs_blockgc_start(mp); /* * Now that we've recovered any pending superblock feature bit * additions, we can finish setting up the attr2 behaviour for the * mount. The noattr2 option overrides the superblock flag, so only * check the superblock feature flag if the mount option is not set. */ if (xfs_has_noattr2(mp)) { mp->m_features &= ~XFS_FEAT_ATTR2; } else if (!xfs_has_attr2(mp) && (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { mp->m_features |= XFS_FEAT_ATTR2; } if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) goto out_free_metadir; } /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. */ error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED, XFS_ILOCK_EXCL, &rip); if (error) { xfs_warn(mp, "Failed to read root inode 0x%llx, error %d", sbp->sb_rootino, -error); goto out_free_metadir; } ASSERT(rip != NULL); if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) { xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); error = -EFSCORRUPTED; goto out_rele_rip; } mp->m_rootip = rip; /* save it */ xfs_iunlock(rip, XFS_ILOCK_EXCL); /* * Initialize realtime inode pointers in the mount structure */ error = xfs_rtmount_inodes(mp); if (error) { /* * Free up the root inode. 
*/ xfs_warn(mp, "failed to read RT inodes"); goto out_rele_rip; } /* Make sure the summary counts are ok. */ error = xfs_check_summary_counts(mp); if (error) goto out_rtunmount; /* * If this is a read-only mount defer the superblock updates until * the next remount into writeable mode. Otherwise we would never * perform the update e.g. for the root filesystem. */ if (mp->m_update_sb && !xfs_is_readonly(mp)) { error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); goto out_rtunmount; } } /* * Initialise the XFS quota management subsystem for this mount */ if (XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_newmount(mp, &quotamount, &quotaflags); if (error) goto out_rtunmount; } else { /* * If a file system had quotas running earlier, but decided to * mount without -o uquota/pquota/gquota options, revoke the * quotachecked license. */ if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { xfs_notice(mp, "resetting quota flags"); error = xfs_mount_reset_sbqflags(mp); if (error) goto out_rtunmount; } } /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. Temporarily create per-AG space reservations for metadata * btree shape changes because space freeing transactions (for inode * inactivation) require the per-AG reservation in lieu of reserving * blocks. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error == -ENOSPC) xfs_warn(mp, "ENOSPC reserving per-AG metadata pool, log recovery may fail."); error = xfs_log_mount_finish(mp); xfs_fs_unreserve_ag_blocks(mp); if (error) { xfs_warn(mp, "log mount finish failed"); goto out_rtunmount; } /* * Now the log is fully replayed, we can transition to full read-only * mode for read-only mounts. This will sync all the metadata and clean * the log so that the recovery we just performed does not have to be * replayed again on the next mount. * * We use the same quiesce mechanism as the rw->ro remount, as they are * semantically identical operations. */ if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); /* * Complete the quota initialisation, post-log-replay component. */ if (quotamount) { ASSERT(mp->m_qflags == 0); mp->m_qflags = quotaflags; xfs_qm_mount_quotas(mp); } /* * Now we are mounted, reserve a small amount of unused space for * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with * attr, unwritten extent conversion at ENOSPC, etc. Data allocations * are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; } return 0; out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); out_free_metadir: if (mp->m_metadirip) xfs_irele(mp->m_metadirip); /* * Inactivate all inodes that might still be in memory after a log * intent recovery failure so that reclaim can free them. 
Metadata * inodes and the root directory shouldn't need inactivation, but the * mount failed for some reason, so pull down all the state and flee. */ xfs_inodegc_flush(mp); /* * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those * two will have scheduled delayed reclaim for the rt/quota inodes. * * This is slightly different from the unmountfs call sequence * because we could be tearing down a partially set up mount. In * particular, if log_mount_finish fails we bail out without calling * qm_unmount_quotas and therefore rely on qm_unmount to release the * quota inodes. */ xfs_unmount_flush_inodes(mp); xfs_log_mount_cancel(mp); out_inodegc_shrinker: shrinker_free(mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); xfs_buftarg_drain(mp->m_ddev_targp); out_free_rtgroup: xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); out_free_perag: xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); out_free_dir: xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); out_remove_error_sysfs: xfs_error_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: xfs_sysfs_del(&mp->m_kobj); out: return error; } /* * This flushes out the inodes,dquots and the superblock, unmounts the * log and makes sure that incore structures are freed. */ void xfs_unmountfs( struct xfs_mount *mp) { int error; /* * Perform all on-disk metadata updates required to inactivate inodes * that the VFS evicted earlier in the unmount process. Freeing inodes * and discarding CoW fork preallocations can cause shape changes to * the free inode and refcount btrees, respectively, so we must finish * this before we discard the metadata space reservations. Metadata * inodes and the root directory do not require inactivation. */ xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) xfs_irele(mp->m_metadirip); xfs_unmount_flush_inodes(mp); xfs_qm_unmount(mp); /* * Unreserve any blocks we have so that when we unmount we don't account * the reserved free space as used. This is really only necessary for * lazy superblock counting because it trusts the incore superblock * counters to be absolutely correct on clean unmount. * * We don't bother correcting this elsewhere for lazy superblock * counting because on mount of an unclean filesystem we reconstruct the * correct counter value and this is irrelevant. * * For non-lazy counter filesystems, this doesn't matter at all because * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ error = xfs_reserve_blocks(mp, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); xfs_unmount_check(mp); /* * Indicate that it's ok to clear log incompat bits before cleaning * the log and writing the unmount record. 
*/ xfs_set_done_with_log_incompat(mp); xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); #if defined(DEBUG) xfs_errortag_clearall(mp); #endif shrinker_free(mp->m_inodegc_shrinker); xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } /* * Determine whether modifications can proceed. The caller specifies the minimum * freeze level for which modifications should not be allowed. This allows * certain operations to proceed while the freeze sequence is in progress, if * necessary. */ bool xfs_fs_writable( struct xfs_mount *mp, int level) { ASSERT(level > SB_UNFROZEN); if ((mp->m_super->s_writers.frozen >= level) || xfs_is_shutdown(mp) || xfs_is_readonly(mp)) return false; return true; } void xfs_add_freecounter( struct xfs_mount *mp, struct percpu_counter *counter, uint64_t delta) { bool has_resv_pool = (counter == &mp->m_fdblocks); uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. */ if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { percpu_counter_add(counter, delta); return; } spin_lock(&mp->m_sb_lock); res_used = mp->m_resblks - mp->m_resblks_avail; if (res_used > delta) { mp->m_resblks_avail += delta; } else { delta -= res_used; mp->m_resblks_avail = mp->m_resblks; percpu_counter_add(counter, delta); } spin_unlock(&mp->m_sb_lock); } int xfs_dec_freecounter( struct xfs_mount *mp, struct percpu_counter *counter, uint64_t delta, bool rsvd) { int64_t lcounter; uint64_t set_aside = 0; s32 batch; bool has_resv_pool; ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); has_resv_pool = (counter == &mp->m_fdblocks); if (rsvd) ASSERT(has_resv_pool); /* * Taking blocks away, need to be more accurate the closer we * are to zero. * * If the counter has a value of less than 2 * max batch size, * then make everything serialise as we are real close to * ENOSPC. */ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else batch = XFS_FDBLOCKS_BATCH; /* * Set aside allocbt blocks because these blocks are tracked as free * space but not available for allocation. Technically this means that a * single reservation cannot consume all remaining free space, but the * ratio of allocbt blocks to usable free blocks should be rather small. * The tradeoff without this is that filesystems that maintain high * perag block reservations can over reserve physical block availability * and fail physical allocation, which leads to much more serious * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ if (has_resv_pool) set_aside = xfs_fdblocks_unavailable(mp); percpu_counter_add_batch(counter, -((int64_t)delta), batch); if (__percpu_counter_compare(counter, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; } /* * lock up the sb for dipping into reserves before releasing the space * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); percpu_counter_add(counter, delta); if (!has_resv_pool || !rsvd) goto fdblocks_enospc; lcounter = (long long)mp->m_resblks_avail - delta; if (lcounter >= 0) { mp->m_resblks_avail = lcounter; spin_unlock(&mp->m_sb_lock); return 0; } xfs_warn_once(mp, "Reserve blocks depleted! 
Consider increasing reserve pool size."); fdblocks_enospc: spin_unlock(&mp->m_sb_lock); return -ENOSPC; } /* * Used to free the superblock along various error paths. */ void xfs_freesb( struct xfs_mount *mp) { struct xfs_buf *bp = mp->m_sb_bp; xfs_buf_lock(bp); mp->m_sb_bp = NULL; xfs_buf_relse(bp); } /* * If the underlying (data/log/rt) device is readonly, there are some * operations that cannot proceed. */ int xfs_dev_is_read_only( struct xfs_mount *mp, char *message) { if (xfs_readonly_buftarg(mp->m_ddev_targp) || xfs_readonly_buftarg(mp->m_logdev_targp) || (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { xfs_notice(mp, "%s required on read-only device.", message); xfs_notice(mp, "write access unavailable, cannot proceed."); return -EROFS; } return 0; } /* Force the summary counters to be recalculated at next mount. */ void xfs_force_summary_recalc( struct xfs_mount *mp) { if (!xfs_has_lazysbcount(mp)) return; xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); } /* * Enable a log incompat feature flag in the primary superblock. The caller * cannot have any other transactions in progress. */ int xfs_add_incompat_log_feature( struct xfs_mount *mp, uint32_t feature) { struct xfs_dsb *dsb; int error; ASSERT(hweight32(feature) == 1); ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); /* * Force the log to disk and kick the background AIL thread to reduce * the chances that the bwrite will stall waiting for the AIL to unpin * the primary superblock buffer. This isn't a data integrity * operation, so we don't need a synchronous push. */ error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return error; xfs_ail_push_all(mp->m_ail); /* * Lock the primary superblock buffer to serialize all callers that * are trying to set feature bits. */ xfs_buf_lock(mp->m_sb_bp); xfs_buf_hold(mp->m_sb_bp); if (xfs_is_shutdown(mp)) { error = -EIO; goto rele; } if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature)) goto rele; /* * Write the primary superblock to disk immediately, because we need * the log_incompat bit to be set in the primary super now to protect * the log items that we're going to commit later. */ dsb = mp->m_sb_bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_features_log_incompat |= cpu_to_be32(feature); error = xfs_bwrite(mp->m_sb_bp); if (error) goto shutdown; /* * Add the feature bits to the incore superblock before we unlock the * buffer. */ xfs_sb_add_incompat_log_features(&mp->m_sb, feature); xfs_buf_relse(mp->m_sb_bp); /* Log the superblock to disk. */ return xfs_sync_sb(mp, false); shutdown: xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); rele: xfs_buf_relse(mp->m_sb_bp); return error; } /* * Clear all the log incompat flags from the superblock. * * The caller cannot be in a transaction, must ensure that the log does not * contain any log items protected by any log incompat bit, and must ensure * that there are no other threads that depend on the state of the log incompat * feature flags in the primary super. * * Returns true if the superblock is dirty. */ bool xfs_clear_incompat_log_features( struct xfs_mount *mp) { bool ret = false; if (!xfs_has_crc(mp) || !xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL) || xfs_is_shutdown(mp) || !xfs_is_done_with_log_incompat(mp)) return false; /* * Update the incore superblock. We synchronize on the primary super * buffer lock to be consistent with the add function, though at least * in theory this shouldn't be necessary. 
*/ xfs_buf_lock(mp->m_sb_bp); xfs_buf_hold(mp->m_sb_bp); if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } xfs_buf_relse(mp->m_sb_bp); return ret; } /* * Update the in-core delayed block counter. * * We prefer to update the counter without having to take a spinlock for every * counter update (i.e. batching). Each change to delayed allocation * reservations can change can easily exceed the default percpu counter * batching, so we use a larger batch factor here. * * Note that we don't currently have any callers requiring fast summation * (e.g. percpu_counter_read) so we can use a big batch value here. */ #define XFS_DELALLOC_BATCH (4096) void xfs_mod_delalloc( struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta) { struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) { percpu_counter_add_batch(&mp->m_delalloc_rtextents, xfs_blen_to_rtbxlen(mp, data_delta), XFS_DELALLOC_BATCH); if (!ind_delta) return; data_delta = 0; } percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta, XFS_DELALLOC_BATCH); }
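The xfs_add_freecounter()/xfs_dec_freecounter() pair above carries a small amount of policy that is easy to miss in the locking detail: freed blocks refill the reserve pool before they show up as free space, and a reserved ("rsvd") allocation may dip into that pool for the whole delta once the free-space counter runs dry. The standalone, single-threaded C sketch below models only that accounting; struct toy_mount and the toy_* helpers are invented for illustration, plain integers stand in for the percpu counters, and the batching and allocbt set-aside details are omitted.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct toy_mount {
	int64_t  fdblocks;	/* free data blocks (stands in for m_fdblocks) */
	uint64_t resblks;	/* reserve pool size (m_resblks) */
	uint64_t resblks_avail;	/* unused reserve blocks (m_resblks_avail) */
};

/* Freeing blocks: top the reserve pool back up before the free counter. */
static void toy_add_freecounter(struct toy_mount *mp, uint64_t delta)
{
	uint64_t res_used = mp->resblks - mp->resblks_avail;

	if (res_used == 0) {
		mp->fdblocks += delta;
	} else if (res_used > delta) {
		mp->resblks_avail += delta;
	} else {
		mp->resblks_avail = mp->resblks;
		mp->fdblocks += delta - res_used;
	}
}

/*
 * Allocating blocks: fail with ENOSPC unless enough free space remains or
 * the caller is allowed to take the whole delta from the reserve pool.
 */
static int toy_dec_freecounter(struct toy_mount *mp, uint64_t delta, int rsvd)
{
	if (mp->fdblocks >= 0 && (uint64_t)mp->fdblocks >= delta) {
		mp->fdblocks -= delta;
		return 0;
	}
	if (rsvd && mp->resblks_avail >= delta) {
		mp->resblks_avail -= delta;
		return 0;
	}
	return -ENOSPC;
}

int main(void)
{
	struct toy_mount mp = { .fdblocks = 10, .resblks = 8, .resblks_avail = 4 };

	printf("plain alloc of 8:    %d\n", toy_dec_freecounter(&mp, 8, 0)); /* succeeds */
	printf("plain alloc of 5:    %d\n", toy_dec_freecounter(&mp, 5, 0)); /* -ENOSPC */
	printf("reserved alloc of 3: %d\n", toy_dec_freecounter(&mp, 3, 1)); /* dips into pool */
	toy_add_freecounter(&mp, 5);	/* refills the pool before fdblocks */
	printf("fdblocks=%lld resblks_avail=%llu\n",
	       (long long)mp.fdblocks, (unsigned long long)mp.resblks_avail);
	return 0;
}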
// SPDX-License-Identifier: GPL-2.0-or-later /* Key garbage collector * * Copyright (C) 2009-2011 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/security.h> #include <keys/keyring-type.h> #include "internal.h" /* * Delay between key revocation/expiry in seconds */ unsigned key_gc_delay = 5 * 60; /* * Reaper for unused keys. */ static void key_garbage_collector(struct work_struct *work); DECLARE_WORK(key_gc_work, key_garbage_collector); /* * Reaper for links from keyrings to dead keys. */ static void key_gc_timer_func(struct timer_list *); static DEFINE_TIMER(key_gc_timer, key_gc_timer_func); static time64_t key_gc_next_run = TIME64_MAX; static struct key_type *key_gc_dead_keytype; static unsigned long key_gc_flags; #define KEY_GC_KEY_EXPIRED 0 /* A key expired and needs unlinking */ #define KEY_GC_REAP_KEYTYPE 1 /* A keytype is being unregistered */ #define KEY_GC_REAPING_KEYTYPE 2 /* Cleared when keytype reaped */ /* * Any key whose type gets unregistered will be re-typed to this if it can't be * immediately unlinked. */ struct key_type key_type_dead = { .name = ".dead", }; /* * Schedule a garbage collection run. * - time precision isn't particularly important */ void key_schedule_gc(time64_t gc_at) { unsigned long expires; time64_t now = ktime_get_real_seconds(); kenter("%lld", gc_at - now); if (gc_at <= now || test_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) { kdebug("IMMEDIATE"); schedule_work(&key_gc_work); } else if (gc_at < key_gc_next_run) { kdebug("DEFERRED"); key_gc_next_run = gc_at; expires = jiffies + (gc_at - now) * HZ; mod_timer(&key_gc_timer, expires); } } /* * Set the expiration time on a key. */ void key_set_expiry(struct key *key, time64_t expiry) { key->expiry = expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; key_schedule_gc(expiry); } } /* * Schedule a dead links collection run.
*/ void key_schedule_gc_links(void) { set_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags); schedule_work(&key_gc_work); } /* * Some key's cleanup time was met after it expired, so we need to get the * reaper to go through a cycle finding expired keys. */ static void key_gc_timer_func(struct timer_list *unused) { kenter(""); key_gc_next_run = TIME64_MAX; key_schedule_gc_links(); } /* * Reap keys of dead type. * * We use three flags to make sure we see three complete cycles of the garbage * collector: the first to mark keys of that type as being dead, the second to * collect dead links and the third to clean up the dead keys. We have to be * careful as there may already be a cycle in progress. * * The caller must be holding key_types_sem. */ void key_gc_keytype(struct key_type *ktype) { kenter("%s", ktype->name); key_gc_dead_keytype = ktype; set_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); smp_mb(); set_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags); kdebug("schedule"); schedule_work(&key_gc_work); kdebug("sleep"); wait_on_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE, TASK_UNINTERRUPTIBLE); key_gc_dead_keytype = NULL; kleave(""); } /* * Garbage collect a list of unreferenced, detached keys */ static noinline void key_gc_unused_keys(struct list_head *keys) { while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); short state = key->state; list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); #ifdef CONFIG_KEY_NOTIFICATIONS remove_watch_list(key->watchers, key->serial); key->watchers = NULL; #endif /* Throw away the key data if the key is instantiated */ if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); atomic_dec(&key->user->nkeys); if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); key_put_tag(key->domain_tag); kfree(key->description); memzero_explicit(key, sizeof(*key)); kmem_cache_free(key_jar, key); } } /* * Garbage collector for unused keys. * * This is done in process context so that we don't have to disable interrupts * all over the place. key_put() schedules this rather than trying to do the * cleanup itself, which means key_put() doesn't have to sleep. */ static void key_garbage_collector(struct work_struct *work) { static LIST_HEAD(graveyard); static u8 gc_state; /* Internal persistent state */ #define KEY_GC_REAP_AGAIN 0x01 /* - Need another cycle */ #define KEY_GC_REAPING_LINKS 0x02 /* - We need to reap links */ #define KEY_GC_REAPING_DEAD_1 0x10 /* - We need to mark dead keys */ #define KEY_GC_REAPING_DEAD_2 0x20 /* - We need to reap dead key links */ #define KEY_GC_REAPING_DEAD_3 0x40 /* - We need to reap dead keys */ #define KEY_GC_FOUND_DEAD_KEY 0x80 /* - We found at least one dead key */ struct rb_node *cursor; struct key *key; time64_t new_timer, limit, expiry; kenter("[%lx,%x]", key_gc_flags, gc_state); limit = ktime_get_real_seconds(); /* Work out what we're going to be doing in this pass */ gc_state &= KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2; gc_state <<= 1; if (test_and_clear_bit(KEY_GC_KEY_EXPIRED, &key_gc_flags)) gc_state |= KEY_GC_REAPING_LINKS; if (test_and_clear_bit(KEY_GC_REAP_KEYTYPE, &key_gc_flags)) gc_state |= KEY_GC_REAPING_DEAD_1; kdebug("new pass %x", gc_state); new_timer = TIME64_MAX; /* As only this function is permitted to remove things from the key * serial tree, if cursor is non-NULL then it will always point to a * valid node in the tree - even if lock got dropped. 
*/ spin_lock(&key_serial_lock); cursor = rb_first(&key_serial_tree); continue_scanning: while (cursor) { key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); if (refcount_read(&key->usage) == 0) goto found_unreferenced_key; if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { if (key->type == key_gc_dead_keytype) { gc_state |= KEY_GC_FOUND_DEAD_KEY; set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; } else if (key->type == &key_type_keyring && key->restrict_link) { goto found_restricted_keyring; } } expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry > limit && expiry < new_timer) { kdebug("will expire %x in %lld", key_serial(key), key->expiry - limit); new_timer = key->expiry; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) if (key->type == key_gc_dead_keytype) gc_state |= KEY_GC_FOUND_DEAD_KEY; if ((gc_state & KEY_GC_REAPING_LINKS) || unlikely(gc_state & KEY_GC_REAPING_DEAD_2)) { if (key->type == &key_type_keyring) goto found_keyring; } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) if (key->type == key_gc_dead_keytype) goto destroy_dead_key; skip_dead_key: if (spin_is_contended(&key_serial_lock) || need_resched()) goto contended; } contended: spin_unlock(&key_serial_lock); maybe_resched: if (cursor) { cond_resched(); spin_lock(&key_serial_lock); goto continue_scanning; } /* We've completed the pass. Set the timer if we need to and queue a * new cycle if necessary. We keep executing cycles until we find one * where we didn't reap any keys. */ kdebug("pass complete"); if (new_timer != TIME64_MAX) { new_timer += key_gc_delay; key_schedule_gc(new_timer); } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_2) || !list_empty(&graveyard)) { /* Make sure that all pending keyring payload destructions are * fulfilled and that people aren't now looking at dead or * dying keys that they don't have a reference upon or a link * to. */ kdebug("gc sync"); synchronize_rcu(); } if (!list_empty(&graveyard)) { kdebug("gc keys"); key_gc_unused_keys(&graveyard); } if (unlikely(gc_state & (KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2))) { if (!(gc_state & KEY_GC_FOUND_DEAD_KEY)) { /* No remaining dead keys: short circuit the remaining * keytype reap cycles. */ kdebug("dead short"); gc_state &= ~(KEY_GC_REAPING_DEAD_1 | KEY_GC_REAPING_DEAD_2); gc_state |= KEY_GC_REAPING_DEAD_3; } else { gc_state |= KEY_GC_REAP_AGAIN; } } if (unlikely(gc_state & KEY_GC_REAPING_DEAD_3)) { kdebug("dead wake"); smp_mb(); clear_bit(KEY_GC_REAPING_KEYTYPE, &key_gc_flags); wake_up_bit(&key_gc_flags, KEY_GC_REAPING_KEYTYPE); } if (gc_state & KEY_GC_REAP_AGAIN) schedule_work(&key_gc_work); kleave(" [end %x]", gc_state); return; /* We found an unreferenced key - once we've removed it from the tree, * we can safely drop the lock. */ found_unreferenced_key: kdebug("unrefd key %d", key->serial); rb_erase(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); list_add_tail(&key->graveyard_link, &graveyard); gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; /* We found a restricted keyring and need to update the restriction if * it is associated with the dead key type. */ found_restricted_keyring: spin_unlock(&key_serial_lock); keyring_restriction_gc(key, key_gc_dead_keytype); goto maybe_resched; /* We found a keyring and we need to check the payload for links to * dead or expired keys. 
We don't flag another reap immediately as we * have to wait for the old payload to be destroyed by RCU before we * can reap the keys to which it refers. */ found_keyring: spin_unlock(&key_serial_lock); keyring_gc(key, limit); goto maybe_resched; /* We found a dead key that is still referenced. Reset its type and * destroy its payload with its semaphore held. */ destroy_dead_key: spin_unlock(&key_serial_lock); kdebug("destroy key %d", key->serial); down_write(&key->sem); key->type = &key_type_dead; if (key_gc_dead_keytype->destroy) key_gc_dead_keytype->destroy(key); memset(&key->payload, KEY_DESTROY, sizeof(key->payload)); up_write(&key->sem); goto maybe_resched; }
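The subtlest part of key_garbage_collector() above is how a single KEY_GC_REAP_KEYTYPE request becomes three consecutive passes: the DEAD_1/DEAD_2 bits held in gc_state survive the pass and are shifted left once at the start of the next one, so mark-dead becomes reap-dead-links becomes reap-dead-keys. The short standalone sketch below reproduces only that mask-and-shift step; the flag values match the defines above, but start_pass() and the pass loop are invented for illustration and ignore the FOUND_DEAD_KEY short-circuit and the REAP_AGAIN rescheduling.

#include <stdio.h>

#define GC_REAPING_DEAD_1	0x10	/* mark keys of the dead type */
#define GC_REAPING_DEAD_2	0x20	/* reap links to dead keys */
#define GC_REAPING_DEAD_3	0x40	/* reap the dead keys themselves */

static unsigned int gc_state;

/* Mirror the "work out what we're doing in this pass" step of the collector. */
static void start_pass(int reap_keytype_requested)
{
	gc_state &= GC_REAPING_DEAD_1 | GC_REAPING_DEAD_2;
	gc_state <<= 1;
	if (reap_keytype_requested)
		gc_state |= GC_REAPING_DEAD_1;
}

int main(void)
{
	int pass;

	/* Pass 1 is triggered by the keytype reap request; later passes are requeued. */
	for (pass = 1; pass <= 4; pass++) {
		start_pass(pass == 1);
		printf("pass %d: gc_state=0x%02x%s%s%s\n", pass, gc_state,
		       gc_state & GC_REAPING_DEAD_1 ? " mark-dead-keys" : "",
		       gc_state & GC_REAPING_DEAD_2 ? " reap-dead-links" : "",
		       gc_state & GC_REAPING_DEAD_3 ? " reap-dead-keys" : "");
	}
	return 0;
}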
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include <linux/file.h> #include <linux/interval_tree.h> #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/slab.h> #include <linux/vfio.h> #include <uapi/linux/vfio.h> #include <uapi/linux/iommufd.h> #include "iommufd_private.h" static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx) { struct iommufd_ioas *ioas = ERR_PTR(-ENODEV); xa_lock(&ictx->objects); if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj)) goto out_unlock; ioas = ictx->vfio_ioas; out_unlock: xa_unlock(&ictx->objects); return ioas; } /** * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists * @ictx: Context to operate on * @out_ioas_id: The IOAS ID of the compatibility IOAS * * Return the ID of the current compatibility IOAS. The ID can be passed into * other functions that take an ioas_id.
*/ int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) { struct iommufd_ioas *ioas; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); *out_ioas_id = ioas->obj.id; iommufd_put_object(ictx, &ioas->obj); return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, "IOMMUFD_VFIO"); /** * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached * @ictx: Context to operate on * * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types. */ int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) { int ret; xa_lock(&ictx->objects); if (!ictx->vfio_ioas) { ictx->no_iommu_mode = 1; ret = 0; } else { ret = -EINVAL; } xa_unlock(&ictx->objects); return ret; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, "IOMMUFD_VFIO"); /** * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created * @ictx: Context to operate on * * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate * on since they do not have an IOAS ID input in their ABI. Only attaching a * group should cause a default creation of the internal ioas, this does nothing * if an existing ioas has already been assigned somehow. */ int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx) { struct iommufd_ioas *ioas = NULL; int ret; ioas = iommufd_ioas_alloc(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); xa_lock(&ictx->objects); /* * VFIO won't allow attaching a container to both iommu and no iommu * operation */ if (ictx->no_iommu_mode) { ret = -EINVAL; goto out_abort; } if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) { ret = 0; iommufd_put_object(ictx, &ictx->vfio_ioas->obj); goto out_abort; } ictx->vfio_ioas = ioas; xa_unlock(&ictx->objects); /* * An automatically created compat IOAS is treated as a userspace * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET, * and if not manually destroyed it will be destroyed automatically * at iommufd release. 
*/ iommufd_object_finalize(ictx, &ioas->obj); return 0; out_abort: xa_unlock(&ictx->objects); iommufd_object_abort(ictx, &ioas->obj); return ret; } EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, "IOMMUFD_VFIO"); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) { struct iommu_vfio_ioas *cmd = ucmd->cmd; struct iommufd_ioas *ioas; if (cmd->__reserved) return -EOPNOTSUPP; switch (cmd->op) { case IOMMU_VFIO_IOAS_GET: ioas = get_compat_ioas(ucmd->ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); cmd->ioas_id = ioas->obj.id; iommufd_put_object(ucmd->ictx, &ioas->obj); return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); case IOMMU_VFIO_IOAS_SET: ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = ioas; xa_unlock(&ucmd->ictx->objects); iommufd_put_object(ucmd->ictx, &ioas->obj); return 0; case IOMMU_VFIO_IOAS_CLEAR: xa_lock(&ucmd->ictx->objects); ucmd->ictx->vfio_ioas = NULL; xa_unlock(&ucmd->ictx->objects); return 0; default: return -EOPNOTSUPP; } } static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, void __user *arg) { u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); struct vfio_iommu_type1_dma_map map; int iommu_prot = IOMMU_CACHE; struct iommufd_ioas *ioas; unsigned long iova; int rc; if (copy_from_user(&map, arg, minsz)) return -EFAULT; if (map.argsz < minsz || map.flags & ~supported_flags) return -EINVAL; if (map.flags & VFIO_DMA_MAP_FLAG_READ) iommu_prot |= IOMMU_READ; if (map.flags & VFIO_DMA_MAP_FLAG_WRITE) iommu_prot |= IOMMU_WRITE; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); /* * Maps created through the legacy interface always use VFIO compatible * rlimit accounting. If the user wishes to use the faster user based * rlimit accounting then they must use the new interface. */ iova = map.iova; rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), map.size, iommu_prot, 0); iommufd_put_object(ictx, &ioas->obj); return rc; } static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, void __user *arg) { size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); /* * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new * dirty tracking direction: * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/ * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/ */ u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL; struct vfio_iommu_type1_dma_unmap unmap; unsigned long unmapped = 0; struct iommufd_ioas *ioas; int rc; if (copy_from_user(&unmap, arg, minsz)) return -EFAULT; if (unmap.argsz < minsz || unmap.flags & ~supported_flags) return -EINVAL; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) { if (unmap.iova != 0 || unmap.size != 0) { rc = -EINVAL; goto err_put; } rc = iopt_unmap_all(&ioas->iopt, &unmapped); } else { if (READ_ONCE(ioas->iopt.disable_large_pages)) { /* * Create cuts at the start and last of the requested * range. If the start IOVA is 0 then it doesn't need to * be cut. */ unsigned long iovas[] = { unmap.iova + unmap.size - 1, unmap.iova - 1 }; rc = iopt_cut_iova(&ioas->iopt, iovas, unmap.iova ? 
2 : 1); if (rc) goto err_put; } rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size, &unmapped); } unmap.size = unmapped; if (copy_to_user(arg, &unmap, minsz)) rc = -EFAULT; err_put: iommufd_put_object(ictx, &ioas->obj); return rc; } static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) { struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = 1; ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); mutex_lock(&ioas->mutex); list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { if (!hwpt_paging->enforce_cache_coherency) { rc = 0; break; } } mutex_unlock(&ioas->mutex); iommufd_put_object(ictx, &ioas->obj); return rc; } static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, unsigned long type) { switch (type) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: case VFIO_UNMAP_ALL: return 1; case VFIO_NOIOMMU_IOMMU: return IS_ENABLED(CONFIG_VFIO_NOIOMMU); case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* * VFIO_DMA_MAP_FLAG_VADDR * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/ * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ * * It is hard to see how this could be implemented safely. */ case VFIO_UPDATE_VADDR: default: return 0; } } static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) { bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode); struct iommufd_ioas *ioas = NULL; int rc = 0; /* * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all * other ioctls. We let them keep working but they mostly fail since no * IOAS should exist. */ if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU && no_iommu_mode) { if (!capable(CAP_SYS_RAWIO)) return -EPERM; return 0; } if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) || no_iommu_mode) return -EINVAL; /* VFIO fails the set_iommu if there is no group */ ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); /* * The difference between TYPE1 and TYPE1v2 is the ability to unmap in * the middle of mapped ranges. This is complicated by huge page support * which creates single large IOPTEs that cannot be split by the iommu * driver. TYPE1 is very old at this point and likely nothing uses it, * however it is simple enough to emulate by simply disabling the * problematic large IOPTEs. Then we can safely unmap within any range. 
*/ if (type == VFIO_TYPE1_IOMMU) rc = iopt_disable_large_pages(&ioas->iopt); iommufd_put_object(ictx, &ioas->obj); return rc; } static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas) { struct io_pagetable *iopt = &ioas->iopt; unsigned long pgsize_bitmap = ULONG_MAX; struct iommu_domain *domain; unsigned long index; down_read(&iopt->domains_rwsem); xa_for_each(&iopt->domains, index, domain) pgsize_bitmap &= domain->pgsize_bitmap; /* See vfio_update_pgsize_bitmap() */ if (pgsize_bitmap & ~PAGE_MASK) { pgsize_bitmap &= PAGE_MASK; pgsize_bitmap |= PAGE_SIZE; } pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment); up_read(&iopt->domains_rwsem); return pgsize_bitmap; } static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas, struct vfio_info_cap_header __user *cur, size_t avail) { struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas = container_of(cur, struct vfio_iommu_type1_info_cap_iova_range __user, header); struct vfio_iommu_type1_info_cap_iova_range cap_iovas = { .header = { .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, .version = 1, }, }; struct interval_tree_span_iter span; interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, ULONG_MAX) { struct vfio_iova_range range; if (!span.is_hole) continue; range.start = span.start_hole; range.end = span.last_hole; if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas + 1) && copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas], &range, sizeof(range))) return -EFAULT; cap_iovas.nr_iovas++; } if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) && copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas))) return -EFAULT; return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas); } static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas, struct vfio_info_cap_header __user *cur, size_t avail) { struct vfio_iommu_type1_info_dma_avail cap_dma = { .header = { .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL, .version = 1, }, /* * iommufd's limit is based on the cgroup's memory limit. * Normally vfio would return U16_MAX here, and provide a module * parameter to adjust it. Since S390 qemu userspace actually * pays attention and needs a value bigger than U16_MAX return * U32_MAX. 
*/ .avail = U32_MAX, }; if (avail >= sizeof(cap_dma) && copy_to_user(cur, &cap_dma, sizeof(cap_dma))) return -EFAULT; return sizeof(cap_dma); } static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, void __user *arg) { typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas, struct vfio_info_cap_header __user *cur, size_t avail); static const fill_cap_fn fill_fns[] = { iommufd_fill_cap_dma_avail, iommufd_fill_cap_iova, }; size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); struct vfio_info_cap_header __user *last_cap = NULL; struct vfio_iommu_type1_info info = {}; struct iommufd_ioas *ioas; size_t total_cap_size; int rc; int i; if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; minsz = min_t(size_t, info.argsz, sizeof(info)); ioas = get_compat_ioas(ictx); if (IS_ERR(ioas)) return PTR_ERR(ioas); info.flags = VFIO_IOMMU_INFO_PGSIZES; info.iova_pgsizes = iommufd_get_pagesizes(ioas); info.cap_offset = 0; down_read(&ioas->iopt.iova_rwsem); total_cap_size = sizeof(info); for (i = 0; i != ARRAY_SIZE(fill_fns); i++) { int cap_size; if (info.argsz > total_cap_size) cap_size = fill_fns[i](ioas, arg + total_cap_size, info.argsz - total_cap_size); else cap_size = fill_fns[i](ioas, NULL, 0); if (cap_size < 0) { rc = cap_size; goto out_put; } cap_size = ALIGN(cap_size, sizeof(u64)); if (last_cap && info.argsz >= total_cap_size && put_user(total_cap_size, &last_cap->next)) { rc = -EFAULT; goto out_put; } last_cap = arg + total_cap_size; total_cap_size += cap_size; } /* * If the user did not provide enough space then only some caps are * returned and the argsz will be updated to the correct amount to get * all caps. */ if (info.argsz >= total_cap_size) info.cap_offset = sizeof(info); info.argsz = total_cap_size; info.flags |= VFIO_IOMMU_INFO_CAPS; if (copy_to_user(arg, &info, minsz)) { rc = -EFAULT; goto out_put; } rc = 0; out_put: up_read(&ioas->iopt.iova_rwsem); iommufd_put_object(ictx, &ioas->obj); return rc; } int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, unsigned long arg) { void __user *uarg = (void __user *)arg; switch (cmd) { case VFIO_GET_API_VERSION: return VFIO_API_VERSION; case VFIO_SET_IOMMU: return iommufd_vfio_set_iommu(ictx, arg); case VFIO_CHECK_EXTENSION: return iommufd_vfio_check_extension(ictx, arg); case VFIO_IOMMU_GET_INFO: return iommufd_vfio_iommu_get_info(ictx, uarg); case VFIO_IOMMU_MAP_DMA: return iommufd_vfio_map_dma(ictx, cmd, uarg); case VFIO_IOMMU_UNMAP_DMA: return iommufd_vfio_unmap_dma(ictx, cmd, uarg); case VFIO_IOMMU_DIRTY_PAGES: default: return -ENOIOCTLCMD; } return -ENOIOCTLCMD; }
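iommufd_get_pagesizes() above advertises only the page sizes every attached domain can honour and, like vfio_update_pgsize_bitmap(), hides anything finer than PAGE_SIZE by folding such bits into a single PAGE_SIZE bit. Below is a standalone sketch of just that computation; TOY_PAGE_SIZE/TOY_PAGE_MASK and the sample bitmaps are assumptions for the example, and the iova_alignment clamp and locking are left out.

#include <stdio.h>
#include <stddef.h>

#define TOY_PAGE_SIZE	4096UL
#define TOY_PAGE_MASK	(~(TOY_PAGE_SIZE - 1))

static unsigned long toy_get_pagesizes(const unsigned long *domain_bitmaps,
				       size_t nr_domains)
{
	unsigned long pgsize_bitmap = ~0UL;
	size_t i;

	/* Intersect the supported-page-size bitmaps of all attached domains. */
	for (i = 0; i < nr_domains; i++)
		pgsize_bitmap &= domain_bitmaps[i];

	/* If anything below PAGE_SIZE survives, report PAGE_SIZE instead. */
	if (pgsize_bitmap & ~TOY_PAGE_MASK) {
		pgsize_bitmap &= TOY_PAGE_MASK;
		pgsize_bitmap |= TOY_PAGE_SIZE;
	}
	return pgsize_bitmap;
}

int main(void)
{
	/*
	 * Domain A supports 1K, 4K and 2M pages; domain B supports 1K, 2M and
	 * 1G.  The intersection is 1K | 2M, so the sub-page 1K bit is dropped
	 * and 4K is reported in its place: prints 0x201000 (4K | 2M).
	 */
	unsigned long domains[] = {
		0x400UL | 0x1000UL | 0x200000UL,
		0x400UL | 0x200000UL | 0x40000000UL,
	};

	printf("pgsize_bitmap = 0x%lx\n",
	       toy_get_pagesizes(domains, sizeof(domains) / sizeof(domains[0])));
	return 0;
}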
832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Paul Stewart * Copyright (c) 2001 Vojtech Pavlik * * HID char devices, giving access to raw HID device events. */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to Paul Stewart <stewart@wetlogic.net> */ #include <linux/poll.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input.h> #include <linux/usb.h> #include <linux/hid.h> #include <linux/hiddev.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/nospec.h> #include "usbhid.h" #ifdef CONFIG_USB_DYNAMIC_MINORS #define HIDDEV_MINOR_BASE 0 #define HIDDEV_MINORS 256 #else #define HIDDEV_MINOR_BASE 96 #define HIDDEV_MINORS 16 #endif #define HIDDEV_BUFFER_SIZE 2048 struct hiddev_list { struct hiddev_usage_ref buffer[HIDDEV_BUFFER_SIZE]; int head; int tail; unsigned flags; struct fasync_struct *fasync; struct hiddev *hiddev; struct list_head node; struct mutex thread_lock; }; /* * Find a report, given the report's type and ID. The ID can be specified * indirectly by REPORT_ID_FIRST (which returns the first report of the given * type) or by (REPORT_ID_NEXT | old_id), which returns the next report of the * given type which follows old_id. */ static struct hid_report * hiddev_lookup_report(struct hid_device *hid, struct hiddev_report_info *rinfo) { unsigned int flags = rinfo->report_id & ~HID_REPORT_ID_MASK; unsigned int rid = rinfo->report_id & HID_REPORT_ID_MASK; struct hid_report_enum *report_enum; struct hid_report *report; struct list_head *list; if (rinfo->report_type < HID_REPORT_TYPE_MIN || rinfo->report_type > HID_REPORT_TYPE_MAX) return NULL; report_enum = hid->report_enum + (rinfo->report_type - HID_REPORT_TYPE_MIN); switch (flags) { case 0: /* Nothing to do -- report_id is already set correctly */ break; case HID_REPORT_ID_FIRST: if (list_empty(&report_enum->report_list)) return NULL; list = report_enum->report_list.next; report = list_entry(list, struct hid_report, list); rinfo->report_id = report->id; break; case HID_REPORT_ID_NEXT: report = report_enum->report_id_hash[rid]; if (!report) return NULL; list = report->list.next; if (list == &report_enum->report_list) return NULL; report = list_entry(list, struct hid_report, list); rinfo->report_id = report->id; break; default: return NULL; } return report_enum->report_id_hash[rinfo->report_id]; } /* * Perform an exhaustive search of the report table for a usage, given its * type and usage id. 
*/ static struct hid_field * hiddev_lookup_usage(struct hid_device *hid, struct hiddev_usage_ref *uref) { int i, j; struct hid_report *report; struct hid_report_enum *report_enum; struct hid_field *field; if (uref->report_type < HID_REPORT_TYPE_MIN || uref->report_type > HID_REPORT_TYPE_MAX) return NULL; report_enum = hid->report_enum + (uref->report_type - HID_REPORT_TYPE_MIN); list_for_each_entry(report, &report_enum->report_list, list) { for (i = 0; i < report->maxfield; i++) { field = report->field[i]; for (j = 0; j < field->maxusage; j++) { if (field->usage[j].hid == uref->usage_code) { uref->report_id = report->id; uref->field_index = i; uref->usage_index = j; return field; } } } } return NULL; } static void hiddev_send_event(struct hid_device *hid, struct hiddev_usage_ref *uref) { struct hiddev *hiddev = hid->hiddev; struct hiddev_list *list; unsigned long flags; spin_lock_irqsave(&hiddev->list_lock, flags); list_for_each_entry(list, &hiddev->list, node) { if (uref->field_index != HID_FIELD_INDEX_NONE || (list->flags & HIDDEV_FLAG_REPORT) != 0) { list->buffer[list->head] = *uref; list->head = (list->head + 1) & (HIDDEV_BUFFER_SIZE - 1); kill_fasync(&list->fasync, SIGIO, POLL_IN); } } spin_unlock_irqrestore(&hiddev->list_lock, flags); wake_up_interruptible(&hiddev->wait); } /* * This is where hid.c calls into hiddev to pass an event that occurred over * the interrupt pipe */ void hiddev_hid_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) { unsigned type = field->report_type; struct hiddev_usage_ref uref; uref.report_type = (type == HID_INPUT_REPORT) ? HID_REPORT_TYPE_INPUT : ((type == HID_OUTPUT_REPORT) ? HID_REPORT_TYPE_OUTPUT : ((type == HID_FEATURE_REPORT) ? HID_REPORT_TYPE_FEATURE : 0)); uref.report_id = field->report->id; uref.field_index = field->index; uref.usage_index = (usage - field->usage); uref.usage_code = usage->hid; uref.value = value; hiddev_send_event(hid, &uref); } EXPORT_SYMBOL_GPL(hiddev_hid_event); void hiddev_report_event(struct hid_device *hid, struct hid_report *report) { unsigned type = report->type; struct hiddev_usage_ref uref; memset(&uref, 0, sizeof(uref)); uref.report_type = (type == HID_INPUT_REPORT) ? HID_REPORT_TYPE_INPUT : ((type == HID_OUTPUT_REPORT) ? HID_REPORT_TYPE_OUTPUT : ((type == HID_FEATURE_REPORT) ? 
HID_REPORT_TYPE_FEATURE : 0)); uref.report_id = report->id; uref.field_index = HID_FIELD_INDEX_NONE; hiddev_send_event(hid, &uref); } /* * fasync file op */ static int hiddev_fasync(int fd, struct file *file, int on) { struct hiddev_list *list = file->private_data; return fasync_helper(fd, file, on, &list->fasync); } /* * release file op */ static int hiddev_release(struct inode * inode, struct file * file) { struct hiddev_list *list = file->private_data; unsigned long flags; spin_lock_irqsave(&list->hiddev->list_lock, flags); list_del(&list->node); spin_unlock_irqrestore(&list->hiddev->list_lock, flags); mutex_lock(&list->hiddev->existancelock); if (!--list->hiddev->open) { if (list->hiddev->exist) { hid_hw_close(list->hiddev->hid); hid_hw_power(list->hiddev->hid, PM_HINT_NORMAL); } else { mutex_unlock(&list->hiddev->existancelock); kfree(list->hiddev); vfree(list); return 0; } } mutex_unlock(&list->hiddev->existancelock); vfree(list); return 0; } static int __hiddev_open(struct hiddev *hiddev, struct file *file) { struct hiddev_list *list; int error; lockdep_assert_held(&hiddev->existancelock); list = vzalloc(sizeof(*list)); if (!list) return -ENOMEM; mutex_init(&list->thread_lock); list->hiddev = hiddev; if (!hiddev->open++) { error = hid_hw_power(hiddev->hid, PM_HINT_FULLON); if (error < 0) goto err_drop_count; error = hid_hw_open(hiddev->hid); if (error < 0) goto err_normal_power; } spin_lock_irq(&hiddev->list_lock); list_add_tail(&list->node, &hiddev->list); spin_unlock_irq(&hiddev->list_lock); file->private_data = list; return 0; err_normal_power: hid_hw_power(hiddev->hid, PM_HINT_NORMAL); err_drop_count: hiddev->open--; vfree(list); return error; } /* * open file op */ static int hiddev_open(struct inode *inode, struct file *file) { struct usb_interface *intf; struct hid_device *hid; struct hiddev *hiddev; int res; intf = usbhid_find_interface(iminor(inode)); if (!intf) return -ENODEV; hid = usb_get_intfdata(intf); hiddev = hid->hiddev; mutex_lock(&hiddev->existancelock); res = hiddev->exist ? __hiddev_open(hiddev, file) : -ENODEV; mutex_unlock(&hiddev->existancelock); return res; } /* * "write" file op */ static ssize_t hiddev_write(struct file * file, const char __user * buffer, size_t count, loff_t *ppos) { return -EINVAL; } /* * "read" file op */ static ssize_t hiddev_read(struct file * file, char __user * buffer, size_t count, loff_t *ppos) { DEFINE_WAIT(wait); struct hiddev_list *list = file->private_data; int event_size; int retval; event_size = ((list->flags & HIDDEV_FLAG_UREF) != 0) ? 
sizeof(struct hiddev_usage_ref) : sizeof(struct hiddev_event); if (count < event_size) return 0; /* lock against other threads */ retval = mutex_lock_interruptible(&list->thread_lock); if (retval) return -ERESTARTSYS; while (retval == 0) { if (list->head == list->tail) { prepare_to_wait(&list->hiddev->wait, &wait, TASK_INTERRUPTIBLE); while (list->head == list->tail) { if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (!list->hiddev->exist) { retval = -EIO; break; } if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; break; } /* let O_NONBLOCK tasks run */ mutex_unlock(&list->thread_lock); schedule(); if (mutex_lock_interruptible(&list->thread_lock)) { finish_wait(&list->hiddev->wait, &wait); return -EINTR; } set_current_state(TASK_INTERRUPTIBLE); } finish_wait(&list->hiddev->wait, &wait); } if (retval) { mutex_unlock(&list->thread_lock); return retval; } while (list->head != list->tail && retval + event_size <= count) { if ((list->flags & HIDDEV_FLAG_UREF) == 0) { if (list->buffer[list->tail].field_index != HID_FIELD_INDEX_NONE) { struct hiddev_event event; event.hid = list->buffer[list->tail].usage_code; event.value = list->buffer[list->tail].value; if (copy_to_user(buffer + retval, &event, sizeof(struct hiddev_event))) { mutex_unlock(&list->thread_lock); return -EFAULT; } retval += sizeof(struct hiddev_event); } } else { if (list->buffer[list->tail].field_index != HID_FIELD_INDEX_NONE || (list->flags & HIDDEV_FLAG_REPORT) != 0) { if (copy_to_user(buffer + retval, list->buffer + list->tail, sizeof(struct hiddev_usage_ref))) { mutex_unlock(&list->thread_lock); return -EFAULT; } retval += sizeof(struct hiddev_usage_ref); } } list->tail = (list->tail + 1) & (HIDDEV_BUFFER_SIZE - 1); } } mutex_unlock(&list->thread_lock); return retval; } /* * "poll" file op * No kernel lock - fine */ static __poll_t hiddev_poll(struct file *file, poll_table *wait) { struct hiddev_list *list = file->private_data; poll_wait(file, &list->hiddev->wait, wait); if (list->head != list->tail) return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hiddev->exist) return EPOLLERR | EPOLLHUP; return 0; } /* * "ioctl" file op */ static noinline int hiddev_ioctl_usage(struct hiddev *hiddev, unsigned int cmd, void __user *user_arg) { struct hid_device *hid = hiddev->hid; struct hiddev_report_info rinfo; struct hiddev_usage_ref_multi *uref_multi = NULL; struct hiddev_usage_ref *uref; struct hid_report *report; struct hid_field *field; int i; uref_multi = kmalloc(sizeof(struct hiddev_usage_ref_multi), GFP_KERNEL); if (!uref_multi) return -ENOMEM; uref = &uref_multi->uref; if (cmd == HIDIOCGUSAGES || cmd == HIDIOCSUSAGES) { if (copy_from_user(uref_multi, user_arg, sizeof(*uref_multi))) goto fault; } else { if (copy_from_user(uref, user_arg, sizeof(*uref))) goto fault; } switch (cmd) { case HIDIOCGUCODE: rinfo.report_type = uref->report_type; rinfo.report_id = uref->report_id; if ((report = hiddev_lookup_report(hid, &rinfo)) == NULL) goto inval; if (uref->field_index >= report->maxfield) goto inval; uref->field_index = array_index_nospec(uref->field_index, report->maxfield); field = report->field[uref->field_index]; if (uref->usage_index >= field->maxusage) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->maxusage); uref->usage_code = field->usage[uref->usage_index].hid; if (copy_to_user(user_arg, uref, sizeof(*uref))) goto fault; goto goodreturn; default: if (cmd != HIDIOCGUSAGE && cmd != HIDIOCGUSAGES && uref->report_type == HID_REPORT_TYPE_INPUT) goto inval; if (uref->report_id == 
HID_REPORT_ID_UNKNOWN) { field = hiddev_lookup_usage(hid, uref); if (field == NULL) goto inval; } else { rinfo.report_type = uref->report_type; rinfo.report_id = uref->report_id; if ((report = hiddev_lookup_report(hid, &rinfo)) == NULL) goto inval; if (uref->field_index >= report->maxfield) goto inval; uref->field_index = array_index_nospec(uref->field_index, report->maxfield); field = report->field[uref->field_index]; if (cmd == HIDIOCGCOLLECTIONINDEX) { if (uref->usage_index >= field->maxusage) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->maxusage); } else if (uref->usage_index >= field->report_count) goto inval; } if (cmd == HIDIOCGUSAGES || cmd == HIDIOCSUSAGES) { if (uref_multi->num_values > HID_MAX_MULTI_USAGES || uref->usage_index + uref_multi->num_values > field->report_count) goto inval; uref->usage_index = array_index_nospec(uref->usage_index, field->report_count - uref_multi->num_values); } switch (cmd) { case HIDIOCGUSAGE: if (uref->usage_index >= field->report_count) goto inval; uref->value = field->value[uref->usage_index]; if (copy_to_user(user_arg, uref, sizeof(*uref))) goto fault; goto goodreturn; case HIDIOCSUSAGE: if (uref->usage_index >= field->report_count) goto inval; field->value[uref->usage_index] = uref->value; goto goodreturn; case HIDIOCGCOLLECTIONINDEX: i = field->usage[uref->usage_index].collection_index; kfree(uref_multi); return i; case HIDIOCGUSAGES: for (i = 0; i < uref_multi->num_values; i++) uref_multi->values[i] = field->value[uref->usage_index + i]; if (copy_to_user(user_arg, uref_multi, sizeof(*uref_multi))) goto fault; goto goodreturn; case HIDIOCSUSAGES: for (i = 0; i < uref_multi->num_values; i++) field->value[uref->usage_index + i] = uref_multi->values[i]; goto goodreturn; } goodreturn: kfree(uref_multi); return 0; fault: kfree(uref_multi); return -EFAULT; inval: kfree(uref_multi); return -EINVAL; } } static noinline int hiddev_ioctl_string(struct hiddev *hiddev, unsigned int cmd, void __user *user_arg) { struct hid_device *hid = hiddev->hid; struct usb_device *dev = hid_to_usb_dev(hid); int idx, len; char *buf; if (get_user(idx, (int __user *)user_arg)) return -EFAULT; if ((buf = kmalloc(HID_STRING_SIZE, GFP_KERNEL)) == NULL) return -ENOMEM; if ((len = usb_string(dev, idx, buf, HID_STRING_SIZE-1)) < 0) { kfree(buf); return -EINVAL; } if (copy_to_user(user_arg+sizeof(int), buf, len+1)) { kfree(buf); return -EFAULT; } kfree(buf); return len; } static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hiddev_list *list = file->private_data; struct hiddev *hiddev = list->hiddev; struct hid_device *hid; struct hiddev_collection_info cinfo; struct hiddev_report_info rinfo; struct hiddev_field_info finfo; struct hiddev_devinfo dinfo; struct hid_report *report; struct hid_field *field; void __user *user_arg = (void __user *)arg; int i, r = -EINVAL; /* Called without BKL by compat methods so no BKL taken */ mutex_lock(&hiddev->existancelock); if (!hiddev->exist) { r = -ENODEV; goto ret_unlock; } hid = hiddev->hid; switch (cmd) { case HIDIOCGVERSION: r = put_user(HID_VERSION, (int __user *)arg) ? 
-EFAULT : 0; break; case HIDIOCAPPLICATION: if (arg >= hid->maxapplication) break; for (i = 0; i < hid->maxcollection; i++) if (hid->collection[i].type == HID_COLLECTION_APPLICATION && arg-- == 0) break; if (i < hid->maxcollection) r = hid->collection[i].usage; break; case HIDIOCGDEVINFO: { struct usb_device *dev = hid_to_usb_dev(hid); struct usbhid_device *usbhid = hid->driver_data; memset(&dinfo, 0, sizeof(dinfo)); dinfo.bustype = BUS_USB; dinfo.busnum = dev->bus->busnum; dinfo.devnum = dev->devnum; dinfo.ifnum = usbhid->ifnum; dinfo.vendor = le16_to_cpu(dev->descriptor.idVendor); dinfo.product = le16_to_cpu(dev->descriptor.idProduct); dinfo.version = le16_to_cpu(dev->descriptor.bcdDevice); dinfo.num_applications = hid->maxapplication; r = copy_to_user(user_arg, &dinfo, sizeof(dinfo)) ? -EFAULT : 0; break; } case HIDIOCGFLAG: r = put_user(list->flags, (int __user *)arg) ? -EFAULT : 0; break; case HIDIOCSFLAG: { int newflags; if (get_user(newflags, (int __user *)arg)) { r = -EFAULT; break; } if ((newflags & ~HIDDEV_FLAGS) != 0 || ((newflags & HIDDEV_FLAG_REPORT) != 0 && (newflags & HIDDEV_FLAG_UREF) == 0)) break; list->flags = newflags; r = 0; break; } case HIDIOCGSTRING: r = hiddev_ioctl_string(hiddev, cmd, user_arg); break; case HIDIOCINITREPORT: usbhid_init_reports(hid); hiddev->initialized = true; r = 0; break; case HIDIOCGREPORT: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } if (rinfo.report_type == HID_REPORT_TYPE_OUTPUT) break; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; hid_hw_request(hid, report, HID_REQ_GET_REPORT); hid_hw_wait(hid); r = 0; break; case HIDIOCSREPORT: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } if (rinfo.report_type == HID_REPORT_TYPE_INPUT) break; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; hid_hw_request(hid, report, HID_REQ_SET_REPORT); hid_hw_wait(hid); r = 0; break; case HIDIOCGREPORTINFO: if (copy_from_user(&rinfo, user_arg, sizeof(rinfo))) { r = -EFAULT; break; } report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; rinfo.num_fields = report->maxfield; r = copy_to_user(user_arg, &rinfo, sizeof(rinfo)) ? -EFAULT : 0; break; case HIDIOCGFIELDINFO: if (copy_from_user(&finfo, user_arg, sizeof(finfo))) { r = -EFAULT; break; } rinfo.report_type = finfo.report_type; rinfo.report_id = finfo.report_id; report = hiddev_lookup_report(hid, &rinfo); if (report == NULL) break; if (finfo.field_index >= report->maxfield) break; finfo.field_index = array_index_nospec(finfo.field_index, report->maxfield); field = report->field[finfo.field_index]; memset(&finfo, 0, sizeof(finfo)); finfo.report_type = rinfo.report_type; finfo.report_id = rinfo.report_id; finfo.field_index = field->report_count - 1; finfo.maxusage = field->maxusage; finfo.flags = field->flags; finfo.physical = field->physical; finfo.logical = field->logical; finfo.application = field->application; finfo.logical_minimum = field->logical_minimum; finfo.logical_maximum = field->logical_maximum; finfo.physical_minimum = field->physical_minimum; finfo.physical_maximum = field->physical_maximum; finfo.unit_exponent = field->unit_exponent; finfo.unit = field->unit; r = copy_to_user(user_arg, &finfo, sizeof(finfo)) ? 
-EFAULT : 0; break; case HIDIOCGUCODE: case HIDIOCGUSAGE: case HIDIOCSUSAGE: case HIDIOCGUSAGES: case HIDIOCSUSAGES: case HIDIOCGCOLLECTIONINDEX: if (!hiddev->initialized) { usbhid_init_reports(hid); hiddev->initialized = true; } r = hiddev_ioctl_usage(hiddev, cmd, user_arg); break; case HIDIOCGCOLLECTIONINFO: if (copy_from_user(&cinfo, user_arg, sizeof(cinfo))) { r = -EFAULT; break; } if (cinfo.index >= hid->maxcollection) break; cinfo.index = array_index_nospec(cinfo.index, hid->maxcollection); cinfo.type = hid->collection[cinfo.index].type; cinfo.usage = hid->collection[cinfo.index].usage; cinfo.level = hid->collection[cinfo.index].level; r = copy_to_user(user_arg, &cinfo, sizeof(cinfo)) ? -EFAULT : 0; break; default: if (_IOC_TYPE(cmd) != 'H' || _IOC_DIR(cmd) != _IOC_READ) break; if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGNAME(0))) { int len = strlen(hid->name) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); r = copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; break; } if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGPHYS(0))) { int len = strlen(hid->phys) + 1; if (len > _IOC_SIZE(cmd)) len = _IOC_SIZE(cmd); r = copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; break; } } ret_unlock: mutex_unlock(&hiddev->existancelock); return r; } static const struct file_operations hiddev_fops = { .owner = THIS_MODULE, .read = hiddev_read, .write = hiddev_write, .poll = hiddev_poll, .open = hiddev_open, .release = hiddev_release, .unlocked_ioctl = hiddev_ioctl, .fasync = hiddev_fasync, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static char *hiddev_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver hiddev_class = { .name = "hiddev%d", .devnode = hiddev_devnode, .fops = &hiddev_fops, .minor_base = HIDDEV_MINOR_BASE, }; /* * This is where hid.c calls us to connect a hid device to the hiddev driver */ int hiddev_connect(struct hid_device *hid, unsigned int force) { struct hiddev *hiddev; struct usbhid_device *usbhid = hid->driver_data; int retval; if (!force) { unsigned int i; for (i = 0; i < hid->maxcollection; i++) if (hid->collection[i].type == HID_COLLECTION_APPLICATION && !IS_INPUT_APPLICATION(hid->collection[i].usage)) break; if (i == hid->maxcollection) return -EINVAL; } if (!(hiddev = kzalloc(sizeof(struct hiddev), GFP_KERNEL))) return -ENOMEM; init_waitqueue_head(&hiddev->wait); INIT_LIST_HEAD(&hiddev->list); spin_lock_init(&hiddev->list_lock); mutex_init(&hiddev->existancelock); hid->hiddev = hiddev; hiddev->hid = hid; hiddev->exist = 1; retval = usb_register_dev(usbhid->intf, &hiddev_class); if (retval) { hid_err(hid, "Not able to get a minor for this device\n"); hid->hiddev = NULL; kfree(hiddev); return retval; } /* * If HID_QUIRK_NO_INIT_REPORTS is set, make sure we don't initialize * the reports. 
*/ hiddev->initialized = hid->quirks & HID_QUIRK_NO_INIT_REPORTS; hiddev->minor = usbhid->intf->minor; return 0; } /* * This is where hid.c calls us to disconnect a hiddev device from the * corresponding hid device (usually because the usb device has disconnected) */ static struct usb_class_driver hiddev_class; void hiddev_disconnect(struct hid_device *hid) { struct hiddev *hiddev = hid->hiddev; struct usbhid_device *usbhid = hid->driver_data; usb_deregister_dev(usbhid->intf, &hiddev_class); mutex_lock(&hiddev->existancelock); hiddev->exist = 0; if (hiddev->open) { hid_hw_close(hiddev->hid); wake_up_interruptible(&hiddev->wait); mutex_unlock(&hiddev->existancelock); } else { mutex_unlock(&hiddev->existancelock); kfree(hiddev); } }
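For context, a hedged userspace sketch of how the hiddev node implemented above is typically consumed: HIDIOCGDEVINFO for identification, HIDIOCSFLAG to switch read() from struct hiddev_event to full struct hiddev_usage_ref records, then a plain read loop. The device path is only an example and error handling is trimmed.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/hiddev.h>

int main(void)
{
	struct hiddev_devinfo dinfo;
	struct hiddev_usage_ref uref;
	int flags = HIDDEV_FLAG_UREF | HIDDEV_FLAG_REPORT;
	int fd = open("/dev/usb/hiddev0", O_RDONLY);	/* example node */

	if (fd < 0)
		return 1;

	if (ioctl(fd, HIDIOCGDEVINFO, &dinfo) == 0)
		printf("vendor %04hx product %04hx on bus %u\n",
		       dinfo.vendor, dinfo.product, dinfo.busnum);

	/* ask hiddev_read() to hand us hiddev_usage_ref records */
	ioctl(fd, HIDIOCSFLAG, &flags);

	while (read(fd, &uref, sizeof(uref)) == sizeof(uref))
		printf("report %u usage %#x value %d\n",
		       uref.report_id, uref.usage_code, uref.value);

	close(fd);
	return 0;
}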
// SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> * (C) 2012 by Vyatta Inc.
<http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static unsigned int nfct_timeout_id __read_mostly; struct ctnl_timeout { struct list_head head; struct list_head free_head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; /* must be at the end */ struct nf_ct_timeout timeout; }; struct nfct_timeout_pernet { struct list_head nfct_timeout_list; struct list_head nfct_timeout_freelist; }; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, .len = CTNL_TIMEOUT_NAME_MAX - 1}, [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 }, [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 }, [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, }; static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net) { return net_generic(net, nfct_timeout_id); } static int ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); return ret; } static int cttimeout_new_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; char *name; int ret; if (!cda[CTA_TIMEOUT_NAME] || !cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; break; } if (matching) { if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. 
*/ if (matching->timeout.l3num != l3num || matching->timeout.l4proto->l4proto != l4num) return -EINVAL; return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, info->net, cda[CTA_TIMEOUT_DATA]); } return -EBUSY; } l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct ctnl_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); timeout->timeout.l3num = l3num; timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); __module_get(THIS_MODULE); list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list); return 0; err: kfree(timeout); err_proto_put: return ret; } static int ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nfct_timeout_pernet *pernet; struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; if (cb->args[2]) return 0; last = (struct ctnl_timeout *)cb->args[1]; if (cb->args[1]) cb->args[1] = 0; rcu_read_lock(); pernet = nfct_timeout_pernet(net); list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) { if (last) { if (cur != last) continue; last = NULL; } if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; break; } } if (!cb->args[1]) cb->args[2] = 1; rcu_read_unlock(); return skb->len; } static int cttimeout_get_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; 
break; } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { int ret = 0; /* We want to avoid races with ctnl_timeout_put. So only when the * current refcnt is 1, we decrease it to 0. */ if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; } return ret; } static int cttimeout_del_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) ctnl_timeout_try_del(info->net, cur); return 0; } name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; break; } return ret; } static int cttimeout_default_set(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err; } ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; return 0; err: return ret; } static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, const struct nf_conntrack_l4proto *l4proto, const unsigned int *timeouts) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int cttimeout_default_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; __u16 l3num; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_DCCP: #ifdef CONFIG_NF_CT_PROTO_DCCP timeouts = nf_dccp_pernet(info->net)->dccp_timeout; #endif break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } if (!timeouts) return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, const char *name) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *timeout, *matching = NULL; list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (!refcount_inc_not_zero(&timeout->refcnt)) goto err; matching = timeout; break; } err: return matching ? 
&matching->timeout : NULL; } static void ctnl_timeout_put(struct nf_ct_timeout *t) { struct ctnl_timeout *timeout = container_of(t, struct ctnl_timeout, timeout); if (refcount_dec_and_test(&timeout->refcnt)) { kfree_rcu(timeout, rcu_head); module_put(THIS_MODULE); } } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { .call = cttimeout_default_set, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { .call = cttimeout_default_get, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, }; static const struct nfnetlink_subsystem cttimeout_subsys = { .name = "conntrack_timeout", .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, .cb_count = IPCTNL_MSG_TIMEOUT_MAX, .cb = cttimeout_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); static int __net_init cttimeout_net_init(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); INIT_LIST_HEAD(&pernet->nfct_timeout_list); INIT_LIST_HEAD(&pernet->nfct_timeout_freelist); return 0; } static void __net_exit cttimeout_net_pre_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { list_del_rcu(&cur->head); list_add(&cur->free_head, &pernet->nfct_timeout_freelist); } /* core calls synchronize_rcu() after this */ } static void __net_exit cttimeout_net_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; if (list_empty(&pernet->nfct_timeout_freelist)) return; nf_ct_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .pre_exit = cttimeout_net_pre_exit, .exit = cttimeout_net_exit, .id = &nfct_timeout_id, .size = sizeof(struct nfct_timeout_pernet), }; static const struct nf_ct_timeout_hooks hooks = { .timeout_find_get = ctnl_timeout_find_get, .timeout_put = ctnl_timeout_put, }; static int __init cttimeout_init(void) { int ret; ret = register_pernet_subsys(&cttimeout_ops); if (ret < 0) return ret; ret = nfnetlink_subsys_register(&cttimeout_subsys); if (ret < 0) { pr_err("cttimeout_init: cannot register cttimeout with " "nfnetlink.\n"); goto err_out; } RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks); return 0; err_out: unregister_pernet_subsys(&cttimeout_ops); return ret; } static int untimeout(struct nf_conn *ct, void *timeout) { struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) RCU_INIT_POINTER(timeout_ext->timeout, NULL); return 0; } static void __exit cttimeout_exit(void) { nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); RCU_INIT_POINTER(nf_ct_timeout_hook, NULL); nf_ct_iterate_destroy(untimeout, NULL); } module_init(cttimeout_init); 
module_exit(cttimeout_exit);
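A hedged userspace sketch, assuming libmnl is available, of the IPCTNL_MSG_TIMEOUT_NEW request that cttimeout_new_timeout() parses: a policy name, CTA_TIMEOUT_L3PROTO/L4PROTO, and a nested CTA_TIMEOUT_DATA block with per-protocol attributes. The policy name and the 30/60 second UDP values are arbitrary examples; ACK handling is omitted.

#include <time.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>

int main(void)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct mnl_socket *nl = mnl_socket_open(NETLINK_NETFILTER);
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nfgenmsg *nfg;
	struct nlattr *nest;

	if (!nl)
		return 1;

	nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8) |
			  IPCTNL_MSG_TIMEOUT_NEW;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
	nlh->nlmsg_seq = time(NULL);

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;
	nfg->version = NFNETLINK_V0;
	nfg->res_id = 0;

	mnl_attr_put_strz(nlh, CTA_TIMEOUT_NAME, "udp-short");	/* example name */
	mnl_attr_put_u16(nlh, CTA_TIMEOUT_L3PROTO, htons(AF_INET));
	mnl_attr_put_u8(nlh, CTA_TIMEOUT_L4PROTO, IPPROTO_UDP);

	/* per-protocol timeouts go inside the nested CTA_TIMEOUT_DATA block */
	nest = mnl_attr_nest_start(nlh, CTA_TIMEOUT_DATA);
	mnl_attr_put_u32(nlh, CTA_TIMEOUT_UDP_UNREPLIED, htonl(30));
	mnl_attr_put_u32(nlh, CTA_TIMEOUT_UDP_REPLIED, htonl(60));
	mnl_attr_nest_end(nlh, nest);

	mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
	mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
	mnl_socket_close(nl);
	return 0;
}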
/* * INETPEER - A storage for permanent information about peers * * This source is covered by the GNU GPL, the same as all kernel sources. * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/timer.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> #include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> /* * Theory of operations. * We keep one entry for each peer IP address. Each node contains long-living * information about the peer which doesn't depend on routes. * * Nodes are removed only when their reference counter goes to 0. * Once that happens, the node may be removed after a sufficient amount of * time has passed since its last use. The less-recently-used entry can * also be removed if the pool is overloaded, i.e. if the total amount of * entries is greater than or equal to the threshold. * * The node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long-living nodes in a single hash slot would significantly delay * lookups performed with disabled BHs. * * Serialisation issues. * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND the reference count being 0. * 3. The global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: * rb_node: pool lock * refcnt: atomically against modifications on other CPUs; * usually under some other lock to prevent the node from disappearing * daddr: unchangeable */ static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4.
*/ int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { u64 nr_entries; /* 1% of physical memory */ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } /* Called with rcu_read_lock() or base->lock held */ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct inet_peer_base *base, unsigned int seq, struct inet_peer *gc_stack[], unsigned int *gc_cnt, struct rb_node **parent_p, struct rb_node ***pp_p) { struct rb_node **pp, *parent, *next; struct inet_peer *p; u32 now; pp = &base->rb_root.rb_node; parent = NULL; while (1) { int cmp; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { now = jiffies; if (READ_ONCE(p->dtime) != now) WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p; } else if (unlikely(read_seqretry(&base->lock, seq))) { break; } if (cmp == -1) pp = &next->rb_left; else pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; return NULL; } /* perform garbage collect on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], unsigned int gc_cnt) { int peer_threshold, peer_maxttl, peer_minttl; struct inet_peer *p; __u32 delta, ttl; int i; peer_threshold = READ_ONCE(inet_peer_threshold); peer_maxttl = READ_ONCE(inet_peer_maxttl); peer_minttl = READ_ONCE(inet_peer_minttl); if (base->total >= peer_threshold) ttl = 0; /* be aggressive */ else ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * base->total / peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; kfree_rcu(p, rcu); } } } /* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); if (p) return p; /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ parent = NULL; write_seqlock_bh(&base->lock); gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. 
*/ p->rate_last = jiffies - 60*HZ; rb_link_node(&p->rb_node, parent, pp); rb_insert_color(&p->rb_node, &base->rb_root); base->total++; } } if (gc_cnt) inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; } EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } /* * Check transmit rate limitation for given message. * The rate information is held in the inet_peer entries now. * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * Note that the same inet_peer fields are modified by functions in * route.c too, but these work for packet destinations while xrlim_allow * works for icmp destinations. This means the rate limiting information * for one "ip object" is shared - and these ICMPs are twice limited: * by source and by destination. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * * Shared between ICMPv4 and ICMPv6. */ #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { unsigned long now, token, otoken, delta; bool rc = false; if (!peer) return true; token = otoken = READ_ONCE(peer->rate_tokens); now = jiffies; delta = now - READ_ONCE(peer->rate_last); if (delta) { WRITE_ONCE(peer->rate_last, now); token += delta; if (token > XRLIM_BURST_FACTOR * timeout) token = XRLIM_BURST_FACTOR * timeout; } if (token >= timeout) { token -= timeout; rc = true; } if (token != otoken) WRITE_ONCE(peer->rate_tokens, token); return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { struct rb_node *p = rb_first(&base->rb_root); while (p) { struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); p = rb_next(p); rb_erase(&peer->rb_node, &base->rb_root); inet_putpeer(peer); cond_resched(); } base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree);
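A standalone illustration (plain C, not kernel code) of the token-bucket arithmetic used by inet_peer_xrlim_allow() above: tokens accrue one per tick since the last check, are capped at XRLIM_BURST_FACTOR * timeout, and each allowed message spends timeout tokens. The names and the loop in main() are made up for the demo.

#include <stdbool.h>
#include <stdio.h>

#define BURST_FACTOR 6

struct bucket {
	unsigned long tokens;
	unsigned long last;	/* time of the previous check, in ticks */
};

static bool xrlim_allow_demo(struct bucket *b, unsigned long now,
			     unsigned long timeout)
{
	unsigned long token = b->tokens + (now - b->last);

	if (token > BURST_FACTOR * timeout)
		token = BURST_FACTOR * timeout;	/* cap the burst */
	b->last = now;
	if (token < timeout) {
		b->tokens = token;
		return false;			/* rate limited */
	}
	b->tokens = token - timeout;		/* spend one message */
	return true;
}

int main(void)
{
	struct bucket b = { .tokens = 0, .last = 0 };
	unsigned long t;

	for (t = 0; t < 1000; t += 50)
		printf("t=%4lu -> %s\n", t,
		       xrlim_allow_demo(&b, t, 100) ? "send" : "drop");
	return 0;
}

With timeout = 100 the loop prints one "send" roughly every 100 ticks: the cap bounds how large a burst can be saved up, and the per-message cost enforces the steady-state rate.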
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" #define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits * (selected with (bit>>5) give us the word number and the low 5 * bits give us the bit/feature number inside the word. * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can * see if it is set in the mask word. */ #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) /* * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all * header macros which use NCAPINTS need to be changed. The duplicated macro * use causes the compiler to issue errors for all headers so that all usage * sites can be corrected.
*/ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. 
*/ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */
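A small standalone example of the word/bit split described in the comment above and encoded by x86_cap_flag_num(): a feature number packs (word << 5) | bit, so the capability word is f >> 5 and the in-word mask is 1UL << (f & 31). The feature value used here is an arbitrary example.

#include <stdio.h>

int main(void)
{
	unsigned int feature = 9 * 32 + 5;	/* example: word 9, bit 5 */
	unsigned int word = feature >> 5;	/* which x86_capability[] word */
	unsigned long mask = 1UL << (feature & 31);	/* bit inside that word */

	printf("feature %u -> capability word %u, mask 0x%08lx\n",
	       feature, word, mask);
	return 0;
}

This is the same split CHECK_BIT_IN_MASK_WORD() performs at compile time when deciding whether a feature bit is forced on (REQUIRED_MASK) or compiled out (DISABLED_MASK).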
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Landlock LSM - Access types and helpers
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2024-2025 Microsoft Corporation
 */

#ifndef _SECURITY_LANDLOCK_ACCESS_H
#define _SECURITY_LANDLOCK_ACCESS_H

#include <linux/bitops.h>
#include <linux/build_bug.h>
#include <linux/kernel.h>
#include <uapi/linux/landlock.h>

#include "limits.h"

/*
 * All access rights that are denied by default whether they are handled or not
 * by a ruleset/layer. This must be ORed with all ruleset->access_masks[]
 * entries when we need to get the absolute handled access masks, see
 * landlock_upgrade_handled_access_masks().
 */
/* clang-format off */
#define _LANDLOCK_ACCESS_FS_INITIALLY_DENIED ( \
        LANDLOCK_ACCESS_FS_REFER)
/* clang-format on */

typedef u16 access_mask_t;

/* Makes sure all filesystem access rights can be stored. */
static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_ACCESS_FS);
/* Makes sure all network access rights can be stored. */
static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_ACCESS_NET);
/* Makes sure all scoped rights can be stored. */
static_assert(BITS_PER_TYPE(access_mask_t) >= LANDLOCK_NUM_SCOPE);
/* Makes sure for_each_set_bit() and for_each_clear_bit() calls are OK. */
static_assert(sizeof(unsigned long) >= sizeof(access_mask_t));

/* Ruleset access masks. */
struct access_masks {
        access_mask_t fs : LANDLOCK_NUM_ACCESS_FS;
        access_mask_t net : LANDLOCK_NUM_ACCESS_NET;
        access_mask_t scope : LANDLOCK_NUM_SCOPE;
};

union access_masks_all {
        struct access_masks masks;
        u32 all;
};

/* Makes sure all fields are covered. */
static_assert(sizeof(typeof_member(union access_masks_all, masks)) ==
              sizeof(typeof_member(union access_masks_all, all)));

typedef u16 layer_mask_t;

/* Makes sure all layers can be checked. */
static_assert(BITS_PER_TYPE(layer_mask_t) >= LANDLOCK_MAX_NUM_LAYERS);

/* Upgrades with all initially denied by default access rights. */
static inline struct access_masks
landlock_upgrade_handled_access_masks(struct access_masks access_masks)
{
        /*
         * All access rights that are denied by default whether they are
         * explicitly handled or not.
         */
        if (access_masks.fs)
                access_masks.fs |= _LANDLOCK_ACCESS_FS_INITIALLY_DENIED;

        return access_masks;
}

#endif /* _SECURITY_LANDLOCK_ACCESS_H */
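landlock_upgrade_handled_access_masks() only touches the filesystem mask, and only when at least one filesystem right is handled, so default-denied rights such as LANDLOCK_ACCESS_FS_REFER always end up in the effective handled set. A minimal user-space sketch of that upgrade step follows; the ACCESS_FS_* values, struct masks, and upgrade_handled() are hypothetical stand-ins for the uapi constants and kernel types.

/*
 * Editor's sketch, not kernel code: flag values and names are made up;
 * only the upgrade rule (OR the default-denied rights into the fs mask
 * iff any fs right is handled) mirrors the helper above.
 */
#include <stdint.h>
#include <stdio.h>

#define ACCESS_FS_EXECUTE          (1u << 0)
#define ACCESS_FS_READ_FILE        (1u << 1)
#define ACCESS_FS_REFER            (1u << 2)  /* denied by default */

#define ACCESS_FS_INITIALLY_DENIED ACCESS_FS_REFER

struct masks {
        uint16_t fs;
        uint16_t net;
        uint16_t scope;
};

/*
 * If a ruleset handles any filesystem right, the rights that are denied
 * by default are implicitly handled as well, so OR them in.
 */
static struct masks upgrade_handled(struct masks m)
{
        if (m.fs)
                m.fs |= ACCESS_FS_INITIALLY_DENIED;
        return m;
}

int main(void)
{
        struct masks fs_ruleset  = { .fs  = ACCESS_FS_READ_FILE };
        struct masks net_ruleset = { .net = 1u << 0 };

        printf("fs ruleset:  %#x -> %#x\n", (unsigned)fs_ruleset.fs,
               (unsigned)upgrade_handled(fs_ruleset).fs);
        /* A ruleset that handles no fs right is left untouched. */
        printf("net ruleset: %#x -> %#x\n", (unsigned)net_ruleset.fs,
               (unsigned)upgrade_handled(net_ruleset).fs);
        return 0;
}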
1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/compat.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/uio.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/gfs2_ondisk.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/crc32.h> #include <linux/writeback.h> #include <linux/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> #include <linux/delay.h> #include <linux/backing-dev.h> #include <linux/fileattr.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "aops.h" #include "dir.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "log.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" /** * gfs2_llseek - seek to a location in a file * @file: the file * @offset: the offset * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) * * SEEK_END requires the glock for the file because it references the * file's size. * * Returns: The new offset, or errno */ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_holder i_gh; loff_t error; switch (whence) { case SEEK_END: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { error = generic_file_llseek(file, offset, whence); gfs2_glock_dq_uninit(&i_gh); } break; case SEEK_DATA: error = gfs2_seek_data(file, offset); break; case SEEK_HOLE: error = gfs2_seek_hole(file, offset); break; case SEEK_CUR: case SEEK_SET: /* * These don't reference inode->i_size and don't depend on the * block mapping, so we don't need the glock. */ error = generic_file_llseek(file, offset, whence); break; default: error = -EINVAL; } return error; } /** * gfs2_readdir - Iterator for a directory * @file: The directory to read from * @ctx: What to feed directory entries to * * Returns: errno */ static int gfs2_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file->f_mapping->host; struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_holder d_gh; int error; error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) return error; error = gfs2_dir_read(dir, ctx, &file->f_ra); gfs2_glock_dq_uninit(&d_gh); return error; } /* * struct fsflag_gfs2flag * * The FS_JOURNAL_DATA_FL flag maps to GFS2_DIF_INHERIT_JDATA for directories, * and to GFS2_DIF_JDATA for non-directories. 
*/ static struct { u32 fsflag; u32 gfsflag; } fsflag_gfs2flag[] = { {FS_SYNC_FL, GFS2_DIF_SYNC}, {FS_IMMUTABLE_FL, GFS2_DIF_IMMUTABLE}, {FS_APPEND_FL, GFS2_DIF_APPENDONLY}, {FS_NOATIME_FL, GFS2_DIF_NOATIME}, {FS_INDEX_FL, GFS2_DIF_EXHASH}, {FS_TOPDIR_FL, GFS2_DIF_TOPDIR}, {FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA}, }; static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags) { int i; u32 fsflags = 0; if (S_ISDIR(inode->i_mode)) gfsflags &= ~GFS2_DIF_JDATA; else gfsflags &= ~GFS2_DIF_INHERIT_JDATA; for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) if (gfsflags & fsflag_gfs2flag[i].gfsflag) fsflags |= fsflag_gfs2flag[i].fsflag; return fsflags; } int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; u32 fsflags; if (d_is_special(dentry)) return -ENOTTY; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); error = gfs2_glock_nq(&gh); if (error) goto out_uninit; fsflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags); fileattr_fill_flags(fa, fsflags); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); return error; } void gfs2_set_inode_flags(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); unsigned int flags = inode->i_flags; flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) flags |= S_NOSEC; if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; if (ip->i_diskflags & GFS2_DIF_APPENDONLY) flags |= S_APPEND; if (ip->i_diskflags & GFS2_DIF_NOATIME) flags |= S_NOATIME; if (ip->i_diskflags & GFS2_DIF_SYNC) flags |= S_SYNC; inode->i_flags = flags; } /* Flags that can be set by user space */ #define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ GFS2_DIF_IMMUTABLE| \ GFS2_DIF_APPENDONLY| \ GFS2_DIF_NOATIME| \ GFS2_DIF_SYNC| \ GFS2_DIF_TOPDIR| \ GFS2_DIF_INHERIT_JDATA) /** * do_gfs2_set_flags - set flags on an inode * @inode: The inode * @reqflags: The flags to set * @mask: Indicates which flags are valid * */ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *bh; struct gfs2_holder gh; int error; u32 new_flags, flags; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (error) return error; error = 0; flags = ip->i_diskflags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) goto out; if (!IS_IMMUTABLE(inode)) { error = gfs2_permission(&nop_mnt_idmap, inode, MAY_WRITE); if (error) goto out; } if ((flags ^ new_flags) & GFS2_DIF_JDATA) { if (new_flags & GFS2_DIF_JDATA) gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_SET_FLAGS); error = filemap_fdatawrite(inode->i_mapping); if (error) goto out; error = filemap_fdatawait(inode->i_mapping); if (error) goto out; truncate_inode_pages(inode->i_mapping, 0); if (new_flags & GFS2_DIF_JDATA) gfs2_ordered_del_inode(ip); } error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) goto out; error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; inode_set_ctime_current(inode); gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); brelse(bh); gfs2_set_inode_flags(inode); gfs2_set_aops(inode); out_trans_end: gfs2_trans_end(sdp); out: gfs2_glock_dq_uninit(&gh); return error; } int gfs2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct 
inode *inode = d_inode(dentry); u32 fsflags = fa->flags, gfsflags = 0; u32 mask; int i; if (d_is_special(dentry)) return -ENOTTY; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) { if (fsflags & fsflag_gfs2flag[i].fsflag) { fsflags &= ~fsflag_gfs2flag[i].fsflag; gfsflags |= fsflag_gfs2flag[i].gfsflag; } } if (fsflags || gfsflags & ~GFS2_FLAGS_USER_SET) return -EINVAL; mask = GFS2_FLAGS_USER_SET; if (S_ISDIR(inode->i_mode)) { mask &= ~GFS2_DIF_JDATA; } else { /* The GFS2_DIF_TOPDIR flag is only valid for directories. */ if (gfsflags & GFS2_DIF_TOPDIR) return -EINVAL; mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA); } return do_gfs2_set_flags(inode, gfsflags, mask); } static int gfs2_getlabel(struct file *filp, char __user *label) { struct inode *inode = file_inode(filp); struct gfs2_sbd *sdp = GFS2_SB(inode); if (copy_to_user(label, sdp->sd_sb.sb_locktable, GFS2_LOCKNAME_LEN)) return -EFAULT; return 0; } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch(cmd) { case FITRIM: return gfs2_fitrim(filp, (void __user *)arg); case FS_IOC_GETFSLABEL: return gfs2_getlabel(filp, (char __user *)arg); } return -ENOTTY; } #ifdef CONFIG_COMPAT static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch(cmd) { /* Keep this list in sync with gfs2_ioctl */ case FITRIM: case FS_IOC_GETFSLABEL: break; default: return -ENOIOCTLCMD; } return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); } #else #define gfs2_compat_ioctl NULL #endif /** * gfs2_size_hint - Give a hint to the size of a write request * @filep: The struct file * @offset: The file offset of the write * @size: The length of the write * * When we are about to do a write, this function records the total * write size in order to provide a suitable hint to the lower layers * about how many blocks will be required. * */ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) { struct inode *inode = file_inode(filep); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; int hint = min_t(size_t, INT_MAX, blks); if (hint > atomic_read(&ip->i_sizehint)) atomic_set(&ip->i_sizehint, hint); } /** * gfs2_allocate_folio_backing - Allocate blocks for a write fault * @folio: The (locked) folio to allocate backing for * @length: Size of the allocation * * We try to allocate all the blocks required for the folio in one go. This * might fail for various reasons, so we keep trying until all the blocks to * back this folio are allocated. If some of the blocks are already allocated, * that is ok too. */ static int gfs2_allocate_folio_backing(struct folio *folio, size_t length) { u64 pos = folio_pos(folio); do { struct iomap iomap = { }; if (gfs2_iomap_alloc(folio->mapping->host, pos, length, &iomap)) return -EIO; if (length < iomap.length) iomap.length = length; length -= iomap.length; pos += iomap.length; } while (length > 0); return 0; } /** * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable * @vmf: The virtual memory fault containing the page to become writable * * When the page becomes writable, we need to ensure that we have * blocks allocated on disk to back that page. 
*/ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = {}; u64 pos = folio_pos(folio); unsigned int data_blocks, ind_blocks, rblocks; vm_fault_t ret = VM_FAULT_LOCKED; struct gfs2_holder gh; size_t length; loff_t size; int err; sb_start_pagefault(inode->i_sb); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { ret = vmf_fs_error(err); goto out_uninit; } /* Check folio index against inode size */ size = i_size_read(inode); if (pos >= size) { ret = VM_FAULT_SIGBUS; goto out_unlock; } /* Update file times before taking folio lock */ file_update_time(vmf->vma->vm_file); /* folio is wholly or partially inside EOF */ if (size - pos < folio_size(folio)) length = size - pos; else length = folio_size(folio); gfs2_size_hint(vmf->vma->vm_file, pos, length); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); /* * iomap_writepage / iomap_writepages currently don't support inline * files, so always unstuff here. */ if (!gfs2_is_stuffed(ip) && !gfs2_write_alloc_required(ip, pos, length)) { folio_lock(folio); if (!folio_test_uptodate(folio) || folio->mapping != inode->i_mapping) { ret = VM_FAULT_NOPAGE; folio_unlock(folio); } goto out_unlock; } err = gfs2_rindex_update(sdp); if (err) { ret = vmf_fs_error(err); goto out_unlock; } gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; err = gfs2_quota_lock_check(ip, &ap); if (err) { ret = vmf_fs_error(err); goto out_unlock; } err = gfs2_inplace_reserve(ip, &ap); if (err) { ret = vmf_fs_error(err); goto out_quota_unlock; } rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); } err = gfs2_trans_begin(sdp, rblocks, 0); if (err) { ret = vmf_fs_error(err); goto out_trans_fail; } /* Unstuff, if required, and allocate backing blocks for folio */ if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) { ret = vmf_fs_error(err); goto out_trans_end; } } folio_lock(folio); /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. 
*/ if (!folio_test_uptodate(folio) || folio->mapping != inode->i_mapping) { ret = VM_FAULT_NOPAGE; goto out_page_locked; } err = gfs2_allocate_folio_backing(folio, length); if (err) ret = vmf_fs_error(err); out_page_locked: if (ret != VM_FAULT_LOCKED) folio_unlock(folio); out_trans_end: gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); if (ret == VM_FAULT_LOCKED) { folio_mark_dirty(folio); folio_wait_stable(folio); } sb_end_pagefault(inode->i_sb); return ret; } static vm_fault_t gfs2_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; vm_fault_t ret; int err; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { ret = vmf_fs_error(err); goto out_uninit; } ret = filemap_fault(vmf); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); return ret; } static const struct vm_operations_struct gfs2_vm_ops = { .fault = gfs2_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, }; /** * gfs2_mmap * @file: The file to map * @vma: The VMA which described the mapping * * There is no need to get a lock here unless we should be updating * atime. We ignore any locking errors since the only consequence is * a missed atime update (which will just be deferred until later). * * Returns: 0 */ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); if (!(file->f_flags & O_NOATIME) && !IS_NOATIME(&ip->i_inode)) { struct gfs2_holder i_gh; int error; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; /* grab lock to update inode */ gfs2_glock_dq_uninit(&i_gh); file_accessed(file); } vma->vm_ops = &gfs2_vm_ops; return 0; } /** * gfs2_open_common - This is common to open and atomic_open * @inode: The inode being opened * @file: The file being opened * * This maybe called under a glock or not depending upon how it has * been called. We must always be called under a glock for regular * files, however. For other file types, it does not matter whether * we hold the glock or not. * * Returns: Error code or 0 for success */ int gfs2_open_common(struct inode *inode, struct file *file) { struct gfs2_file *fp; int ret; if (S_ISREG(inode->i_mode)) { ret = generic_file_open(inode, file); if (ret) return ret; if (!gfs2_is_jdata(GFS2_I(inode))) file->f_mode |= FMODE_CAN_ODIRECT; } fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); if (!fp) return -ENOMEM; mutex_init(&fp->f_fl_mutex); gfs2_assert_warn(GFS2_SB(inode), !file->private_data); file->private_data = fp; if (file->f_mode & FMODE_WRITE) { ret = gfs2_qa_get(GFS2_I(inode)); if (ret) goto fail; } return 0; fail: kfree(file->private_data); file->private_data = NULL; return ret; } /** * gfs2_open - open a file * @inode: the inode to open * @file: the struct file for this opening * * After atomic_open, this function is only used for opening files * which are already cached. We must still get the glock for regular * files to ensure that we have the file size uptodate for the large * file check which is in the common code. That is only an issue for * regular files though. 
* * Returns: errno */ static int gfs2_open(struct inode *inode, struct file *file) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; int error; bool need_unlock = false; if (S_ISREG(ip->i_inode.i_mode)) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; need_unlock = true; } error = gfs2_open_common(inode, file); if (need_unlock) gfs2_glock_dq_uninit(&i_gh); return error; } /** * gfs2_release - called to close a struct file * @inode: the inode the struct file belongs to * @file: the struct file being closed * * Returns: errno */ static int gfs2_release(struct inode *inode, struct file *file) { struct gfs2_inode *ip = GFS2_I(inode); kfree(file->private_data); file->private_data = NULL; if (file->f_mode & FMODE_WRITE) { if (gfs2_rs_active(&ip->i_res)) gfs2_rs_delete(ip); gfs2_qa_put(ip); } return 0; } /** * gfs2_fsync - sync the dirty data for a file (across the cluster) * @file: the file that points to the dentry * @start: the start position in the file to sync * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * * We split the data flushing here so that we don't wait for the data * until after we've also sent the metadata to disk. Note that for * data=ordered, we will write & wait for the data at the log flush * stage anyway, so this is unlikely to make much of a difference * except in the data=writeback case. * * If the fdatawrite fails due to any reason except -EIO, we will * continue the remainder of the fsync, although we'll still report * the error at the end. This is to match filemap_write_and_wait_range() * behaviour. * * Returns: errno */ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; int sync_state = inode->i_state & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; if (mapping->nrpages) { ret1 = filemap_fdatawrite_range(mapping, start, end); if (ret1 == -EIO) return ret1; } if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); if (ret) return ret; if (gfs2_is_jdata(ip)) ret = file_write_and_wait(file); if (ret) return ret; gfs2_ail_flush(ip->i_gl, 1); } if (mapping->nrpages) ret = file_fdatawait_range(file, start, end); return ret ? ret : ret1; } static inline bool should_fault_in_pages(struct iov_iter *i, struct kiocb *iocb, size_t *prev_count, size_t *window_size) { size_t count = iov_iter_count(i); size_t size, offs; if (!count) return false; if (!user_backed_iter(i)) return false; /* * Try to fault in multiple pages initially. When that doesn't result * in any progress, fall back to a single page. */ size = PAGE_SIZE; offs = offset_in_page(iocb->ki_pos); if (*prev_count != count) { size_t nr_dirtied; nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 8); size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); } *prev_count = count; *window_size = size - offs; return true; } static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct gfs2_holder *gh) { struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); size_t prev_count = 0, window_size = 0; size_t read = 0; ssize_t ret; /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. 
If a page fault occurs, we indicate * that the inode glock may be dropped, fault in the pages manually, * and retry. * * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger * physical as well as manual page faults, and we need to disable both * kinds. * * For direct I/O, gfs2 takes the inode glock in deferred mode. This * locking mode is compatible with other deferred holders, so multiple * processes and nodes can do direct I/O to a file at the same time. * There's no guarantee that reads or writes will be atomic. Any * coordination among readers and writers needs to happen externally. */ if (!iov_iter_count(to)) return 0; /* skip atime */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, IOMAP_DIO_PARTIAL, NULL, read); to->nofault = false; pagefault_enable(); if (ret <= 0 && ret != -EFAULT) goto out_unlock; /* No increment (+=) because iomap_dio_rw returns a cumulative value. */ if (ret > 0) read = ret; if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); if (window_size) goto retry; } out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); /* User space doesn't expect partial success. */ if (ret < 0) return ret; return read; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct gfs2_holder *gh) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; size_t written = 0; bool enough_retries; ssize_t ret; /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate * that the inode glock may be dropped, fault in the pages manually, * and retry. * * For writes, iomap_dio_rw only triggers manual page faults, so we * don't need to disable physical ones. */ /* * Deferred lock, even if its a write, since we do no allocation on * this path. All we need to change is the atime, and this lock mode * ensures that other nodes have flushed their buffered read caches * (i.e. their page cache entries for this inode). We do not, * unfortunately, have the option of only flushing a range like the * VFS does. */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) goto out_unlock; from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, IOMAP_DIO_PARTIAL, NULL, written); from->nofault = false; if (ret <= 0) { if (ret == -ENOTBLK) ret = 0; if (ret != -EFAULT) goto out_unlock; } /* No increment (+=) because iomap_dio_rw returns a cumulative value. */ if (ret > 0) written = ret; enough_retries = prev_count == iov_iter_count(from) && window_size <= PAGE_SIZE; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_readable(from, window_size); if (window_size) { if (!enough_retries) goto retry; /* fall back to buffered I/O */ ret = 0; } } out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); /* User space doesn't expect partial success. 
*/ if (ret < 0) return ret; return written; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct gfs2_inode *ip; struct gfs2_holder gh; size_t prev_count = 0, window_size = 0; size_t read = 0; ssize_t ret; /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate * that the inode glock may be dropped, fault in the pages manually, * and retry. */ if (iocb->ki_flags & IOCB_DIRECT) return gfs2_file_direct_read(iocb, to, &gh); pagefault_disable(); iocb->ki_flags |= IOCB_NOIO; ret = generic_file_read_iter(iocb, to); iocb->ki_flags &= ~IOCB_NOIO; pagefault_enable(); if (ret >= 0) { if (!iov_iter_count(to)) return ret; read = ret; } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; if (iocb->ki_flags & IOCB_NOWAIT) return ret; } ip = GFS2_I(iocb->ki_filp->f_mapping->host); gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); if (ret <= 0 && ret != -EFAULT) goto out_unlock; if (ret > 0) read += ret; if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { gfs2_glock_dq(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); if (window_size) goto retry; } out_unlock: if (gfs2_holder_queued(&gh)) gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); return read ? read : ret; } static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, struct gfs2_holder *gh) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; size_t orig_count = iov_iter_count(from); size_t written = 0; ssize_t ret; /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate * that the inode glock may be dropped, fault in the pages manually, * and retry. */ if (inode == sdp->sd_rindex) { statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS); if (!statfs_gh) return -ENOMEM; } gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { retry: window_size -= fault_in_iov_iter_readable(from, window_size); if (!window_size) { ret = -EFAULT; goto out_uninit; } from->count = min(from->count, window_size); } ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, statfs_gh); if (ret) goto out_unlock; } pagefault_disable(); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL); pagefault_enable(); if (ret > 0) written += ret; if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); if (ret <= 0 && ret != -EFAULT) goto out_unlock; from->count = orig_count - written; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_glock_dq(gh); goto retry; } out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); kfree(statfs_gh); from->count = orig_count - written; return written ? 
written : ret; } /** * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context * @from: The data to write * * We have to do a lock/unlock here to refresh the inode size for * O_APPEND writes, otherwise we can land up writing at the wrong * offset. There is still a race, but provided the app is using its * own file locking, this will make O_APPEND work as expected. * */ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; ssize_t ret; gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); if (iocb->ki_flags & IOCB_APPEND) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) return ret; gfs2_glock_dq_uninit(&gh); } inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out_unlock; ret = file_remove_privs(file); if (ret) goto out_unlock; if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; ssize_t buffered, ret2; /* * Note that under direct I/O, we don't allow and inode * timestamp updates, so we're not calling file_update_time() * here. */ ret = gfs2_file_direct_write(iocb, from, &gh); if (ret < 0 || !iov_iter_count(from)) goto out_unlock; iocb->ki_flags |= IOCB_DSYNC; buffered = gfs2_file_buffered_write(iocb, from, &gh); if (unlikely(buffered <= 0)) { if (!ret) ret = buffered; goto out_unlock; } /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. If the writeback or invalidate fails, only report * the direct I/O range as we don't know if the buffered pages * made it to disk. */ ret2 = generic_write_sync(iocb, buffered); invalidate_mapping_pages(mapping, (iocb->ki_pos - buffered) >> PAGE_SHIFT, (iocb->ki_pos - 1) >> PAGE_SHIFT); if (!ret || ret2 > 0) ret += ret2; } else { ret = file_update_time(file); if (ret) goto out_unlock; ret = gfs2_file_buffered_write(iocb, from, &gh); if (likely(ret > 0)) ret = generic_write_sync(iocb, ret); } out_unlock: inode_unlock(inode); return ret; } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, int mode) { struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); loff_t end = offset + len; struct buffer_head *dibh; int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (unlikely(error)) return error; gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { error = gfs2_unstuff_dinode(ip); if (unlikely(error)) goto out; } while (offset < end) { struct iomap iomap = { }; error = gfs2_iomap_alloc(inode, offset, end - offset, &iomap); if (error) goto out; offset = iomap.offset + iomap.length; if (!(iomap.flags & IOMAP_F_NEW)) continue; error = sb_issue_zeroout(sb, iomap.addr >> inode->i_blkbits, iomap.length >> inode->i_blkbits, GFP_NOFS); if (error) { fs_err(GFS2_SB(inode), "Failed to zero data buffers\n"); goto out; } } out: brelse(dibh); return error; } /** * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of * blocks, determine how many bytes can be written. * @ip: The inode in question. * @len: Max cap of bytes. What we return in *len must be <= this. * @data_blocks: Compute and return the number of data blocks needed * @ind_blocks: Compute and return the number of indirect blocks needed * @max_blocks: The total blocks available to work with. * * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. 
*/ static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, unsigned int *data_blocks, unsigned int *ind_blocks, unsigned int max_blocks) { loff_t max = *len; const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); max_data -= tmp; } *data_blocks = max_data; *ind_blocks = max_blocks - max_data; *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; if (*len > max) { *len = max; gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); } } static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_alloc_parms ap = {}; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes, max_blks; int error; const loff_t pos = offset; const loff_t count = len; loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; loff_t max_chunk_size = UINT_MAX & bsize_mask; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; offset &= bsize_mask; len = next - offset; bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; if (!bytes) bytes = UINT_MAX; bytes &= bsize_mask; if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; gfs2_size_hint(file, offset, len); gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); ap.min_target = data_blocks + ind_blocks; while (len > 0) { if (len < bytes) bytes = len; if (!gfs2_write_alloc_required(ip, offset, bytes)) { len -= bytes; offset += bytes; continue; } /* We need to determine how many bytes we can actually * fallocate without exceeding quota or going over the * end of the fs. We start off optimistically by assuming * we can write max_bytes */ max_bytes = (len > max_chunk_size) ? max_chunk_size : len; /* Since max_bytes is most likely a theoretical max, we * calculate a more realistic 'bytes' to serve as a good * starting point for the number of bytes we may be able * to write */ gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; error = gfs2_quota_lock_check(ip, &ap); if (error) return error; /* ap.allowed tells us how many blocks quota will allow * us to write. Check if this reduces max_blks */ max_blks = UINT_MAX; if (ap.allowed) max_blks = ap.allowed; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; /* check if the selected rgrp limits our max_blks further */ if (ip->i_res.rs_reserved < max_blks) max_blks = ip->i_res.rs_reserved; /* Almost done. Calculate bytes that can be written using * max_blks. We also recompute max_bytes, data_blocks and * ind_blocks */ calc_max_reserv(ip, &max_bytes, &data_blocks, &ind_blocks, max_blks); rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); if (gfs2_is_jdata(ip)) rblocks += data_blocks ? 
data_blocks : 1; error = gfs2_trans_begin(sdp, rblocks, PAGE_SIZE >> inode->i_blkbits); if (error) goto out_trans_fail; error = fallocate_chunk(inode, offset, max_bytes, mode); gfs2_trans_end(sdp); if (error) goto out_trans_fail; len -= max_bytes; offset += max_bytes; gfs2_inplace_release(ip); gfs2_quota_unlock(ip); } if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) i_size_write(inode, pos + count); file_update_time(file); mark_inode_dirty(inode); if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) return vfs_fsync_range(file, pos, pos + count - 1, (file->f_flags & __O_SYNC) ? 0 : 1); return 0; out_trans_fail: gfs2_inplace_release(ip); out_qunlock: gfs2_quota_unlock(ip); return error; } static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; if (mode & ~(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* fallocate is needed by gfs2_grow to reserve space in the rindex */ if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex) return -EOPNOTSUPP; inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len) > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); if (ret) goto out_unlock; } ret = get_write_access(inode); if (ret) goto out_unlock; if (mode & FALLOC_FL_PUNCH_HOLE) { ret = __gfs2_punch_hole(file, offset, len); } else { ret = __gfs2_fallocate(file, mode, offset, len); if (ret) gfs2_rs_deltree(&ip->i_res); } put_write_access(inode); out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); inode_unlock(inode); return ret; } static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { ssize_t ret; gfs2_size_hint(out, *ppos, len); ret = iter_file_splice_write(pipe, out, ppos, len, flags); return ret; } #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** * gfs2_lock - acquire/release a posix lock on a file * @file: the file pointer * @cmd: either modify or retrieve lock state, possibly wait * @fl: type and range of lock * * Returns: errno */ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); struct lm_lockstruct *ls = &sdp->sd_lockstruct; if (!(fl->c.flc_flags & FL_POSIX)) return -ENOLCK; if (gfs2_withdrawing_or_withdrawn(sdp)) { if (lock_is_unlock(fl)) locks_lock_file_wait(file, fl); return -EIO; } if (cmd == F_CANCELLK) return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); else if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (lock_is_unlock(fl)) return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); else return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); } static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) { struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl); /* * Make sure gfs2_glock_put() won't sleep under the file->f_lock * spinlock. 
*/ spin_lock(&file->f_lock); gfs2_holder_uninit(fl_gh); spin_unlock(&file->f_lock); gfs2_glock_put(gl); } static int do_flock(struct file *file, int cmd, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; struct gfs2_inode *ip = GFS2_I(file_inode(file)); struct gfs2_glock *gl; unsigned int state; u16 flags; int error = 0; int sleeptime; state = lock_is_write(fl) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; flags = GL_EXACT | GL_NOPID; if (!IS_SETLKW(cmd)) flags |= LM_FLAG_TRY_1CB; mutex_lock(&fp->f_fl_mutex); if (gfs2_holder_initialized(fl_gh)) { struct file_lock request; if (fl_gh->gh_state == state) goto out; locks_init_lock(&request); request.c.flc_type = F_UNLCK; request.c.flc_flags = FL_FLOCK; locks_lock_file_wait(file, &request); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, &gfs2_flock_glops, CREATE, &gl); if (error) goto out; spin_lock(&file->f_lock); gfs2_holder_init(gl, state, flags, fl_gh); spin_unlock(&file->f_lock); gfs2_glock_put(gl); } for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { error = gfs2_glock_nq(fl_gh); if (error != GLR_TRYFAILED) break; fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB; fl_gh->gh_flags |= LM_FLAG_TRY; msleep(sleeptime); } if (error) { __flock_holder_uninit(file, fl_gh); if (error == GLR_TRYFAILED) error = -EAGAIN; } else { error = locks_lock_file_wait(file, fl); gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); } out: mutex_unlock(&fp->f_fl_mutex); return error; } static void do_unflock(struct file *file, struct file_lock *fl) { struct gfs2_file *fp = file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; mutex_lock(&fp->f_fl_mutex); locks_lock_file_wait(file, fl); if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); __flock_holder_uninit(file, fl_gh); } mutex_unlock(&fp->f_fl_mutex); } /** * gfs2_flock - acquire/release a flock lock on a file * @file: the file pointer * @cmd: either modify or retrieve lock state, possibly wait * @fl: type and range of lock * * Returns: errno */ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { if (!(fl->c.flc_flags & FL_FLOCK)) return -ENOLCK; if (lock_is_unlock(fl)) { do_unflock(file, fl); return 0; } else { return do_flock(file, cmd, fl); } } const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, .splice_read = copy_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, .fop_flags = FOP_ASYNC_LOCK, }; const struct file_operations gfs2_dir_fops = { .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, .llseek = default_llseek, .fop_flags = FOP_ASYNC_LOCK, }; #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, .splice_read = copy_splice_read, 
.splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, }; const struct file_operations gfs2_dir_fops_nolock = { .iterate_shared = gfs2_readdir, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, .llseek = default_llseek, };
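Both gfs2_fileattr_get() and gfs2_fileattr_set() above walk the fsflag_gfs2flag[] table rather than open-coding each flag, and the setter rejects any user bit the table cannot translate. The user-space sketch below shows that table-driven translation with placeholder flag values, not the real FS_*_FL/GFS2_DIF_* constants.

/*
 * Editor's sketch, not kernel code: placeholder flag values; only the
 * table-driven two-way translation and the "unknown bit => error" check
 * mirror the gfs2 fileattr code above.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define FSFLAG_SYNC      (1u << 0)
#define FSFLAG_IMMUTABLE (1u << 1)
#define FSFLAG_JDATA     (1u << 2)

#define DIF_SYNC         (1u << 8)
#define DIF_IMMUTABLE    (1u << 9)
#define DIF_JDATA        (1u << 10)

static const struct {
        uint32_t fsflag;
        uint32_t difflag;
} flag_map[] = {
        { FSFLAG_SYNC,      DIF_SYNC },
        { FSFLAG_IMMUTABLE, DIF_IMMUTABLE },
        { FSFLAG_JDATA,     DIF_JDATA },
};

/* Disk flags -> user-visible fsflags (direction of gfs2_fileattr_get()). */
static uint32_t dif_to_fsflags(uint32_t dif)
{
        uint32_t fsflags = 0;
        size_t i;

        for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++)
                if (dif & flag_map[i].difflag)
                        fsflags |= flag_map[i].fsflag;
        return fsflags;
}

/*
 * fsflags -> disk flags (direction of gfs2_fileattr_set()); any bit left
 * over after the walk has no mapping and is rejected, like the -EINVAL
 * check in the kernel function. Returns -1 on unknown bits, 0 otherwise.
 */
static int fsflags_to_dif(uint32_t fsflags, uint32_t *dif)
{
        size_t i;

        *dif = 0;
        for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++) {
                if (fsflags & flag_map[i].fsflag) {
                        fsflags &= ~flag_map[i].fsflag;
                        *dif |= flag_map[i].difflag;
                }
        }
        return fsflags ? -1 : 0;
}

int main(void)
{
        uint32_t dif;

        printf("dif -> fsflags: %#x\n",
               (unsigned)dif_to_fsflags(DIF_SYNC | DIF_JDATA));

        if (fsflags_to_dif(FSFLAG_IMMUTABLE | (1u << 31), &dif))
                printf("unknown fsflag rejected\n");
        else
                printf("fsflags -> dif: %#x\n", (unsigned)dif);
        return 0;
}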
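A second sketch: several of the read/write paths above share one control-flow pattern, namely that page faults are disabled while the glock is held, and on -EFAULT the lock is dropped, the user buffer is faulted in manually, and the operation is retried. The code below is only a user-space simulation of that flow; the mutex stands in for the glock, and do_transfer()/fault_in_buffer() are fake helpers, not kernel APIs.

/*
 * Editor's sketch, not kernel code: a simulation of "disable faults under
 * the lock, drop the lock on -EFAULT, fault in, retry" as used by
 * gfs2_file_read_iter()/gfs2_file_buffered_write() above.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static pthread_mutex_t glock = PTHREAD_MUTEX_INITIALIZER;
static int faulted_in;                  /* pretend page-fault state */

/* Simulated copy: fails until the buffer has been "faulted in". */
static ssize_t do_transfer(char *dst, const char *src, size_t len)
{
        if (!faulted_in)
                return -EFAULT;
        memcpy(dst, src, len);
        return (ssize_t)len;
}

static void fault_in_buffer(void)
{
        faulted_in = 1;                 /* stands in for fault_in_iov_iter_*() */
}

static ssize_t locked_transfer(char *dst, const char *src, size_t len)
{
        ssize_t ret;

retry:
        pthread_mutex_lock(&glock);     /* "acquire the glock" */
        ret = do_transfer(dst, src, len);
        pthread_mutex_unlock(&glock);

        if (ret == -EFAULT) {
                /* Cannot fault in while holding the lock: do it now, retry. */
                fault_in_buffer();
                goto retry;
        }
        return ret;
}

int main(void)
{
        char dst[16];
        ssize_t n = locked_transfer(dst, "hello", 6);

        printf("transferred %zd bytes: %s\n", n, dst);
        return 0;
}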
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
 *
 * An implementation of the DCCP protocol
 *
 * This code has been developed by the University of Waikato WAND
 * research group. For further information please see https://www.wand.net.nz/
 * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
 *
 * This code also uses code from Lulea University, rereleased as GPL by its
 * authors:
 * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
 * and to make it work as a loadable module in the DCCP stack written by
 * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
 *
 * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 */

#include <linux/string.h>
#include <linux/slab.h>
#include "packet_history.h"
#include "../../dccp.h"

/*
 * Transmitter History Routines
 */
static struct kmem_cache *tfrc_tx_hist_slab;

int __init tfrc_tx_packet_history_init(void)
{
        tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
                                              sizeof(struct tfrc_tx_hist_entry),
                                              0, SLAB_HWCACHE_ALIGN, NULL);
        return tfrc_tx_hist_slab == NULL ?
-ENOBUFS : 0; } void tfrc_tx_packet_history_exit(void) { if (tfrc_tx_hist_slab != NULL) { kmem_cache_destroy(tfrc_tx_hist_slab); tfrc_tx_hist_slab = NULL; } } int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); if (entry == NULL) return -ENOBUFS; entry->seqno = seqno; entry->stamp = ktime_get_real(); entry->next = *headp; *headp = entry; return 0; } void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) { struct tfrc_tx_hist_entry *head = *headp; while (head != NULL) { struct tfrc_tx_hist_entry *next = head->next; kmem_cache_free(tfrc_tx_hist_slab, head); head = next; } *headp = NULL; } /* * Receiver History Routines */ static struct kmem_cache *tfrc_rx_hist_slab; int __init tfrc_rx_packet_history_init(void) { tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", sizeof(struct tfrc_rx_hist_entry), 0, SLAB_HWCACHE_ALIGN, NULL); return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; } void tfrc_rx_packet_history_exit(void) { if (tfrc_rx_hist_slab != NULL) { kmem_cache_destroy(tfrc_rx_hist_slab); tfrc_rx_hist_slab = NULL; } } static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, const struct sk_buff *skb, const u64 ndp) { const struct dccp_hdr *dh = dccp_hdr(skb); entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; entry->tfrchrx_ccval = dh->dccph_ccval; entry->tfrchrx_type = dh->dccph_type; entry->tfrchrx_ndp = ndp; entry->tfrchrx_tstamp = ktime_get_real(); } void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp) { struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); tfrc_rx_hist_entry_from_skb(entry, skb, ndp); } /* has the packet contained in skb been seen before? */ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) { const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; int i; if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) return 1; for (i = 1; i <= h->loss_count; i++) if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) return 1; return 0; } static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { const u8 idx_a = tfrc_rx_hist_index(h, a), idx_b = tfrc_rx_hist_index(h, b); swap(h->ring[idx_a], h->ring[idx_b]); } /* * Private helper functions for loss detection. * * In the descriptions, `Si' refers to the sequence number of entry number i, * whose NDP count is `Ni' (lower case is used for variables). * Note: All __xxx_loss functions expect that a test against duplicates has been * performed already: the seqno of the skb must not be less than the seqno * of loss_prev; and it must not equal that of any valid history entry. 
*/ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) { u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = DCCP_SKB_CB(skb)->dccpd_seq; if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ h->loss_count = 1; tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); } } static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) { u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, s2 = DCCP_SKB_CB(skb)->dccpd_seq; if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ h->loss_count = 2; tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); return; } /* S0 < S2 < S1 */ if (dccp_loss_free(s0, s2, n2)) { u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; if (dccp_loss_free(s2, s1, n1)) { /* hole is filled: S0, S2, and S1 are consecutive */ h->loss_count = 0; h->loss_start = tfrc_rx_hist_index(h, 1); } else /* gap between S2 and S1: just update loss_prev */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); } else { /* gap between S0 and S2 */ /* * Reorder history to insert S2 between S0 and S1 */ tfrc_rx_hist_swap(h, 0, 3); h->loss_start = tfrc_rx_hist_index(h, 3); tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); h->loss_count = 2; } } /* return 1 if a new loss event has been identified */ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) { u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, s3 = DCCP_SKB_CB(skb)->dccpd_seq; if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ h->loss_count = 3; tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); return 1; } /* S3 < S2 */ if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ /* * Reorder history to insert S3 between S1 and S2 */ tfrc_rx_hist_swap(h, 2, 3); tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); h->loss_count = 3; return 1; } /* S0 < S3 < S1 */ if (dccp_loss_free(s0, s3, n3)) { u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; if (dccp_loss_free(s3, s1, n1)) { /* hole between S0 and S1 filled by S3 */ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; if (dccp_loss_free(s1, s2, n2)) { /* entire hole filled by S0, S3, S1, S2 */ h->loss_start = tfrc_rx_hist_index(h, 2); h->loss_count = 0; } else { /* gap remains between S1 and S2 */ h->loss_start = tfrc_rx_hist_index(h, 1); h->loss_count = 1; } } else /* gap exists between S3 and S1, loss_count stays at 2 */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); return 0; } /* * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 * Reorder history to insert S3 between S0 and S1. */ tfrc_rx_hist_swap(h, 0, 3); h->loss_start = tfrc_rx_hist_index(h, 3); tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); h->loss_count = 3; return 1; } /* recycle RX history records to continue loss detection if necessary */ static void __three_after_loss(struct tfrc_rx_hist *h) { /* * At this stage we know already that there is a gap between S0 and S1 * (since S0 was the highest sequence number received before detecting * the loss). To recycle the loss record, it is thus only necessary to * check for other possible gaps between S1/S2 and between S2/S3. 
	 */
	u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
	    s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
	    s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
	u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
	    n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;

	if (dccp_loss_free(s1, s2, n2)) {

		if (dccp_loss_free(s2, s3, n3)) {
			/* no gap between S2 and S3: entire hole is filled */
			h->loss_start = tfrc_rx_hist_index(h, 3);
			h->loss_count = 0;
		} else {
			/* gap between S2 and S3 */
			h->loss_start = tfrc_rx_hist_index(h, 2);
			h->loss_count = 1;
		}

	} else {	/* gap between S1 and S2 */
		h->loss_start = tfrc_rx_hist_index(h, 1);
		h->loss_count = 2;
	}
}

/**
 * tfrc_rx_handle_loss  -  Loss detection and further processing
 * @h:		    The non-empty RX history object
 * @lh:		    Loss Intervals database to update
 * @skb:	    Currently received packet
 * @ndp:	    The NDP count belonging to @skb
 * @calc_first_li:  Caller-dependent computation of first loss interval in @lh
 * @sk:		    Used by @calc_first_li (see tfrc_lh_interval_add)
 *
 * Chooses action according to pending loss, updates LI database when a new
 * loss was detected, and does required post-processing. Returns 1 when caller
 * should send feedback, 0 otherwise.
 * Since it also takes care of reordering during loss detection and updates the
 * records accordingly, the caller should not perform any more RX history
 * operations when loss_count is greater than 0 after calling this function.
 */
int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
			struct sk_buff *skb, const u64 ndp,
			u32 (*calc_first_li)(struct sock *), struct sock *sk)
{
	int is_new_loss = 0;

	if (h->loss_count == 0) {
		__do_track_loss(h, skb, ndp);
	} else if (h->loss_count == 1) {
		__one_after_loss(h, skb, ndp);
	} else if (h->loss_count != 2) {
		DCCP_BUG("invalid loss_count %d", h->loss_count);
	} else if (__two_after_loss(h, skb, ndp)) {
		/*
		 * Update Loss Interval database and recycle RX records
		 */
		is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
		__three_after_loss(h);
	}
	return is_new_loss;
}

int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
{
	int i;

	for (i = 0; i <= TFRC_NDUPACK; i++) {
		h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
		if (h->ring[i] == NULL)
			goto out_free;
	}

	h->loss_count = h->loss_start = 0;
	return 0;

out_free:
	while (i-- != 0) {
		kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
		h->ring[i] = NULL;
	}
	return -ENOBUFS;
}

void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
{
	int i;

	for (i = 0; i <= TFRC_NDUPACK; ++i)
		if (h->ring[i] != NULL) {
			kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
			h->ring[i] = NULL;
		}
}

/**
 * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
 * @h:	The non-empty RX history object
 */
static inline struct tfrc_rx_hist_entry *
			tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
{
	return h->ring[0];
}

/**
 * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
 * @h:	The non-empty RX history object
 */
static inline struct tfrc_rx_hist_entry *
			tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
{
	return h->ring[h->rtt_sample_prev];
}

/**
 * tfrc_rx_hist_sample_rtt  -  Sample RTT from timestamp / CCVal
 * @h:   receive histogram
 * @skb: packet containing timestamp.
 *
 * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
 * to compute a sample with given data - calling function should check this.
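 *
 * (Editorial note: per RFC 4342, 8.1 the sender advances the CCVal window
 *  counter once every quarter RTT, so a CCVal difference of 4 between two
 *  packets corresponds to roughly one full RTT between their send times;
 *  that is why delta_v == 4 is treated as the optimal match below.)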
 */
u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
{
	u32 sample = 0, delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
					tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);

	if (delta_v < 1 || delta_v > 4) {	/* unsuitable CCVal delta */

		if (h->rtt_sample_prev == 2) {	/* previous candidate stored */
			sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
				       tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
			if (sample)
				sample = 4 / sample *
					ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
						       tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
			else	/*
				 * FIXME: This condition is in principle not
				 * possible but occurs when CCID is used for
				 * two-way data traffic. I have tried to trace
				 * it, but the cause does not seem to be here.
				 */
				DCCP_BUG("please report to dccp@vger.kernel.org"
					 " => prev = %u, last = %u",
					 tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
					 tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
		} else if (delta_v < 1) {
			h->rtt_sample_prev = 1;
			goto keep_ref_for_next_time;
		}

	} else if (delta_v == 4) /* optimal match */
		sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
	else {			 /* suboptimal match */
		h->rtt_sample_prev = 2;
		goto keep_ref_for_next_time;
	}

	if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
		DCCP_WARN("RTT sample %u too large, using max\n", sample);
		sample = DCCP_SANE_RTT_MAX;
	}

	h->rtt_sample_prev = 0;	       /* use current entry as next reference */
keep_ref_for_next_time:

	return sample;
}
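The following is a minimal, standalone user-space sketch of the quarter-RTT scaling idea that tfrc_rx_hist_sample_rtt() relies on; it is not part of packet_history.c, and the helper names as well as the modulo-16 CCVal subtraction are illustrative assumptions rather than kernel API.

#include <stdint.h>
#include <stdio.h>

/* CCVal is a small window counter; subtract modulo 16 (illustrative only). */
static uint8_t ccval_delta(uint8_t newer, uint8_t older)
{
	return (uint8_t)((newer + 16 - older) & 0xF);
}

/*
 * Scale the observed gap between two packets (in microseconds) up to a
 * full RTT: each CCVal step stands for roughly a quarter RTT, so a delta
 * of 1..4 covers delta/4 of an RTT.
 */
static uint32_t rtt_estimate_us(uint8_t ccval_new, uint8_t ccval_old,
				uint32_t gap_us)
{
	uint8_t delta = ccval_delta(ccval_new, ccval_old);

	if (delta < 1 || delta > 4)
		return 0;	/* unsuitable sample, as in the kernel code above */
	return gap_us * 4 / delta;
}

int main(void)
{
	/* Packets 10 ms apart whose CCVals differ by 2, i.e. half an RTT observed. */
	printf("estimated RTT: %u us\n", rtt_estimate_us(6, 4, 10000));
	return 0;
}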
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/nfs/dir.c
 *
 *  Copyright (C) 1992  Rick Sladkey
 *
 *  nfs directory handling functions
 *
 * 10 Apr 1996	Added silly rename for unlink	--okir
 * 28 Sep 1996	Improved directory cache --okir
 * 23 Aug 1997  Claus Heine claus@momo.math.rwth-aachen.de
 *              Re-implemented silly rename for unlink, newly implemented
 *              silly rename for nfs_rename() following the suggestions
 *              of Olaf Kirch (okir) found in this file.
 *              Following Linus comments on my original hack, this version
 *              depends only on the dcache stuff and doesn't touch the inode
 *              layer (iput() and friends).
 *  6 Jun 1999	Cache readdir lookups in the page cache.
-DaveM */ #include <linux/compat.h> #include <linux/module.h> #include <linux/time.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/swap.h> #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> #include <linux/hash.h> #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "nfstrace.h" /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, struct dir_context *); static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); static void nfs_readdir_clear_array(struct folio *); static int nfs_do_create(struct inode *dir, struct dentry *dentry, umode_t mode, int open_flags); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, .iterate_shared = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, }; const struct address_space_operations nfs_dir_aops = { .free_folio = nfs_readdir_clear_array, }; #define NFS_INIT_DTSIZE PAGE_SIZE static struct nfs_open_dir_context * alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); if (ctx != NULL) { ctx->attr_gencount = nfsi->attr_gencount; ctx->dtsize = NFS_INIT_DTSIZE; spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) nfs_set_cache_invalid(dir, NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf)); spin_unlock(&dir->i_lock); return ctx; } return ERR_PTR(-ENOMEM); } static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx) { spin_lock(&dir->i_lock); list_del_rcu(&ctx->list); spin_unlock(&dir->i_lock); kfree_rcu(ctx, rcu_head); } /* * Open file */ static int nfs_opendir(struct inode *inode, struct file *filp) { int res = 0; struct nfs_open_dir_context *ctx; dfprintk(FILE, "NFS: open dir(%pD2)\n", filp); nfs_inc_stats(inode, NFSIOS_VFSOPEN); ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; } filp->private_data = ctx; out: return res; } static int nfs_closedir(struct inode *inode, struct file *filp) { put_nfs_open_dir_context(file_inode(filp), filp->private_data); return 0; } struct nfs_cache_array_entry { u64 cookie; u64 ino; const char *name; unsigned int name_len; unsigned char d_type; }; struct nfs_cache_array { u64 change_attr; u64 last_cookie; unsigned int size; unsigned char folio_full : 1, folio_is_eof : 1, cookies_are_ordered : 1; struct nfs_cache_array_entry array[] __counted_by(size); }; struct nfs_readdir_descriptor { struct file *file; struct folio *folio; struct dir_context *ctx; pgoff_t folio_index; pgoff_t folio_index_max; u64 dir_cookie; u64 last_cookie; loff_t current_index; __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; unsigned long attr_gencount; unsigned int cache_entry_index; 
unsigned int buffer_fills; unsigned int dtsize; bool clear_cache; bool plus; bool eob; bool eof; }; static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz) { struct nfs_server *server = NFS_SERVER(file_inode(desc->file)); unsigned int maxsize = server->dtsize; if (sz > maxsize) sz = maxsize; if (sz < NFS_MIN_FILE_IO_SIZE) sz = NFS_MIN_FILE_IO_SIZE; desc->dtsize = sz; } static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc) { nfs_set_dtsize(desc, desc->dtsize >> 1); } static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc) { nfs_set_dtsize(desc, desc->dtsize << 1); } static void nfs_readdir_folio_init_array(struct folio *folio, u64 last_cookie, u64 change_attr) { struct nfs_cache_array *array; array = kmap_local_folio(folio, 0); array->change_attr = change_attr; array->last_cookie = last_cookie; array->size = 0; array->folio_full = 0; array->folio_is_eof = 0; array->cookies_are_ordered = 1; kunmap_local(array); } /* * we are freeing strings created by nfs_add_to_readdir_array() */ static void nfs_readdir_clear_array(struct folio *folio) { struct nfs_cache_array *array; unsigned int i; array = kmap_local_folio(folio, 0); for (i = 0; i < array->size; i++) kfree(array->array[i].name); array->size = 0; kunmap_local(array); } static void nfs_readdir_folio_reinit_array(struct folio *folio, u64 last_cookie, u64 change_attr) { nfs_readdir_clear_array(folio); nfs_readdir_folio_init_array(folio, last_cookie, change_attr); } static struct folio * nfs_readdir_folio_array_alloc(u64 last_cookie, gfp_t gfp_flags) { struct folio *folio = folio_alloc(gfp_flags, 0); if (folio) nfs_readdir_folio_init_array(folio, last_cookie, 0); return folio; } static void nfs_readdir_folio_array_free(struct folio *folio) { if (folio) { nfs_readdir_clear_array(folio); folio_put(folio); } } static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array) { return array->size == 0 ? array->last_cookie : array->array[0].cookie; } static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->folio_is_eof = 1; array->folio_full = 1; } static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) { return array->folio_full; } /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { const char *ret = kmemdup_nul(name, len, GFP_KERNEL); /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. 
*/ if (ret != NULL) kmemleak_not_leak(ret); return ret; } static size_t nfs_readdir_array_maxentries(void) { return (PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry); } /* * Check that the next array entry lies entirely within the page bounds */ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) { if (array->folio_full) return -ENOSPC; if (array->size == nfs_readdir_array_maxentries()) { array->folio_full = 1; return -ENOSPC; } return 0; } static int nfs_readdir_folio_array_append(struct folio *folio, const struct nfs_entry *entry, u64 *cookie) { struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; const char *name; int ret = -ENOMEM; name = nfs_readdir_copy_name(entry->name, entry->len); array = kmap_local_folio(folio, 0); if (!name) goto out; ret = nfs_readdir_array_can_expand(array); if (ret) { kfree(name); goto out; } array->size++; cache_entry = &array->array[array->size - 1]; cache_entry->cookie = array->last_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; cache_entry->name_len = entry->len; cache_entry->name = name; array->last_cookie = entry->cookie; if (array->last_cookie <= cache_entry->cookie) array->cookies_are_ordered = 0; if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: *cookie = array->last_cookie; kunmap_local(array); return ret; } #define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14) /* * Hash algorithm allowing content addressible access to sequences * of directory cookies. Content is addressed by the value of the * cookie index of the first readdir entry in a page. * * We select only the first 18 bits to avoid issues with excessive * memory use for the page cache XArray. 18 bits should allow the caching * of 262144 pages of sequences of readdir entries. Since each page holds * 127 readdir entries for a typical 64-bit system, that works out to a * cache of ~ 33 million entries per directory. 
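 *
 * (Editorial note: the 127-entries-per-page figure follows from
 *  nfs_readdir_array_maxentries() above, assuming 4 KiB pages and 8-byte
 *  pointers: (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
 *  sizeof(struct nfs_cache_array_entry) = (4096 - 24) / 32 = 127.)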
*/ static pgoff_t nfs_readdir_folio_cookie_hash(u64 cookie) { if (cookie == 0) return 0; return hash_64(cookie, 18); } static bool nfs_readdir_folio_validate(struct folio *folio, u64 last_cookie, u64 change_attr) { struct nfs_cache_array *array = kmap_local_folio(folio, 0); int ret = true; if (array->change_attr != change_attr) ret = false; if (nfs_readdir_array_index_cookie(array) != last_cookie) ret = false; kunmap_local(array); return ret; } static void nfs_readdir_folio_unlock_and_put(struct folio *folio) { folio_unlock(folio); folio_put(folio); } static void nfs_readdir_folio_init_and_validate(struct folio *folio, u64 cookie, u64 change_attr) { if (folio_test_uptodate(folio)) { if (nfs_readdir_folio_validate(folio, cookie, change_attr)) return; nfs_readdir_clear_array(folio); } nfs_readdir_folio_init_array(folio, cookie, change_attr); folio_mark_uptodate(folio); } static struct folio *nfs_readdir_folio_get_locked(struct address_space *mapping, u64 cookie, u64 change_attr) { pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); struct folio *folio; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return NULL; nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); return folio; } static u64 nfs_readdir_folio_last_cookie(struct folio *folio) { struct nfs_cache_array *array; u64 ret; array = kmap_local_folio(folio, 0); ret = array->last_cookie; kunmap_local(array); return ret; } static bool nfs_readdir_folio_needs_filling(struct folio *folio) { struct nfs_cache_array *array; bool ret; array = kmap_local_folio(folio, 0); ret = !nfs_readdir_array_is_full(array); kunmap_local(array); return ret; } static void nfs_readdir_folio_set_eof(struct folio *folio) { struct nfs_cache_array *array; array = kmap_local_folio(folio, 0); nfs_readdir_array_set_eof(array); kunmap_local(array); } static struct folio *nfs_readdir_folio_get_next(struct address_space *mapping, u64 cookie, u64 change_attr) { pgoff_t index = nfs_readdir_folio_cookie_hash(cookie); struct folio *folio; folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return NULL; nfs_readdir_folio_init_and_validate(folio, cookie, change_attr); if (nfs_readdir_folio_last_cookie(folio) != cookie) nfs_readdir_folio_reinit_array(folio, cookie, change_attr); return folio; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } static bool nfs_readdir_use_cookie(const struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return false; return true; } static void nfs_readdir_seek_next_array(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { if (array->folio_full) { desc->last_cookie = array->last_cookie; desc->current_index += array->size; desc->cache_entry_index = 0; desc->folio_index++; } else desc->last_cookie = nfs_readdir_array_index_cookie(array); } static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc) { desc->current_index = 0; desc->last_cookie = 0; desc->folio_index = 0; } static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; if (diff < 0) goto out_eof; if (diff >= array->size) { if (array->folio_is_eof) goto out_eof; nfs_readdir_seek_next_array(array, desc); return -EAGAIN; } index = (unsigned int)diff; desc->dir_cookie = 
array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: desc->eof = true; return -EBADCOOKIE; } static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, u64 cookie) { if (!array->cookies_are_ordered) return true; /* Optimisation for monotonically increasing cookies */ if (cookie >= array->last_cookie) return false; if (array->size && cookie < array->array[0].cookie) return false; return true; } static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { unsigned int i; int status = -EAGAIN; if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) goto check_eof; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { if (nfs_readdir_use_cookie(desc->file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = desc->current_index + i; desc->cache_entry_index = i; return 0; } } check_eof: if (array->folio_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; } else nfs_readdir_seek_next_array(array, desc); return status; } static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; array = kmap_local_folio(desc->folio, 0); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); kunmap_local(array); return status; } /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, __be32 *verf, u64 cookie, struct page **pages, size_t bufsize, __be32 *verf_res) { struct inode *inode = file_inode(desc->file); struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, .verf = verf, .cookie = cookie, .pages = pages, .page_len = bufsize, .plus = desc->plus, }; struct nfs_readdir_res res = { .verf = verf_res, }; unsigned long timestamp, gencount; int error; again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; error: return error; } static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); int error; error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus); if (error) return error; entry->fattr->time_start = desc->timestamp; entry->fattr->gencount = desc->gencount; return 0; } /* Match file and dirent using either filehandle or fileid * Note: caller is responsible for checking the fsid */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { struct inode *inode; struct nfs_inode *nfsi; if (d_really_is_negative(dentry)) return 0; inode = d_inode(dentry); if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; nfsi = NFS_I(inode); if (entry->fattr->fileid != nfsi->fileid) return 0; if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) return 0; return 1; } #define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL) static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx, unsigned int cache_hits, unsigned int cache_misses) { if 
(!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) return false; if (ctx->pos == 0 || cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD) return true; return false; } /* * This function is called by the getattr code to request the * use of readdirplus to accelerate any future lookups in the same * directory. */ void nfs_readdir_record_entry_cache_hit(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && S_ISDIR(dir->i_mode)) { rcu_read_lock(); list_for_each_entry_rcu (ctx, &nfsi->open_files, list) atomic_inc(&ctx->cache_hits); rcu_read_unlock(); } } /* * This function is mainly for use by nfs_getattr(). * * If this is an 'ls -l', we want to force use of readdirplus. */ void nfs_readdir_record_entry_cache_miss(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && S_ISDIR(dir->i_mode)) { rcu_read_lock(); list_for_each_entry_rcu (ctx, &nfsi->open_files, list) atomic_inc(&ctx->cache_misses); rcu_read_unlock(); } } static void nfs_lookup_advise_force_readdirplus(struct inode *dir, unsigned int flags) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) return; if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL)) return; nfs_readdir_record_entry_cache_miss(dir); } static void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; int status; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; if (filename.len == 0) return; /* Validate that the name doesn't contain any illegal '\0' */ if (strnlen(filename.name, filename.len) != filename.len) return; /* ...or '/' */ if (strnchr(filename.name, filename.len, '/')) return; if (filename.name[0] == '.') { if (filename.len == 1) return; if (filename.len == 2 && filename.name[1] == '.') return; } filename.hash = full_name_hash(parent, filename.name, filename.len); dentry = d_lookup(parent, &filename); again: if (!dentry) { dentry = d_alloc_parallel(parent, &filename, &wq); if (IS_ERR(dentry)) return; } if (!d_in_lookup(dentry)) { /* Is there a mountpoint here? 
If so, just exit */ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, &entry->fattr->fsid)) goto out; if (nfs_same_file(dentry, entry)) { if (!entry->fh->size) goto out; nfs_set_verifier(dentry, dir_verifier); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) nfs_setsecurity(d_inode(dentry), entry->fattr); trace_nfs_readdir_lookup_revalidate(d_inode(parent), dentry, 0, status); goto out; } else { trace_nfs_readdir_lookup_revalidate_failed( d_inode(parent), dentry, 0); d_invalidate(dentry); dput(dentry); dentry = NULL; goto again; } } if (!entry->fh->size) { d_lookup_done(dentry); goto out; } inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); alias = d_splice_alias(inode, dentry); d_lookup_done(dentry); if (alias) { if (IS_ERR(alias)) goto out; dput(dentry); dentry = alias; } nfs_set_verifier(dentry, dir_verifier); trace_nfs_readdir_lookup(d_inode(parent), dentry, 0); out: dput(dentry); } static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *stream) { int ret; if (entry->fattr->label) entry->fattr->label->len = NFS4_MAXLABELLEN; ret = xdr_decode(desc, entry, stream); if (ret || !desc->plus) return ret; nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); return 0; } /* Perform conversion from xdr to cache array */ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct page **xdr_pages, unsigned int buflen, struct folio **arrays, size_t narrays, u64 change_attr) { struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; struct page *scratch; struct xdr_buf buf; u64 cookie; int status; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_page(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); if (status != 0) break; status = nfs_readdir_folio_array_append(folio, entry, &cookie); if (status != -ENOSPC) continue; if (folio->mapping != mapping) { if (!--narrays) break; new = nfs_readdir_folio_array_alloc(cookie, GFP_KERNEL); if (!new) break; arrays++; *arrays = folio = new; } else { new = nfs_readdir_folio_get_next(mapping, cookie, change_attr); if (!new) break; if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); folio = new; } desc->folio_index_max++; status = nfs_readdir_folio_array_append(folio, entry, &cookie); } while (!status && !entry->eof); switch (status) { case -EBADCOOKIE: if (!entry->eof) break; nfs_readdir_folio_set_eof(folio); fallthrough; case -EAGAIN: status = 0; break; case -ENOSPC: status = 0; if (!desc->plus) break; while (!nfs_readdir_entry_decode(desc, entry, &stream)) ; } if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); put_page(scratch); return status; } static void nfs_readdir_free_pages(struct page **pages, size_t npages) { while (npages--) put_page(pages[npages]); kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call * to nfs_readdir_free_pages() */ static struct page **nfs_readdir_alloc_pages(size_t npages) { struct page **pages; size_t i; pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); if (!pages) return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } return pages; out_freepages: nfs_readdir_free_pages(pages, i); return NULL; } static int 
nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, __be32 *verf_arg, __be32 *verf_res, struct folio **arrays, size_t narrays) { u64 change_attr; struct page **pages; struct folio *folio = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); unsigned int dtsize = desc->dtsize; unsigned int pglen; int status = -ENOMEM; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; entry->cookie = nfs_readdir_folio_last_cookie(folio); entry->fh = nfs_alloc_fhandle(); entry->fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); entry->server = NFS_SERVER(inode); if (entry->fh == NULL || entry->fattr == NULL) goto out; array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; pages = nfs_readdir_alloc_pages(array_size); if (!pages) goto out; change_attr = inode_peek_iversion_raw(inode); status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages, dtsize, verf_res); if (status < 0) goto free_pages; pglen = status; if (pglen != 0) status = nfs_readdir_folio_filler(desc, entry, pages, pglen, arrays, narrays, change_attr); else nfs_readdir_folio_set_eof(folio); desc->buffer_fills++; free_pages: nfs_readdir_free_pages(pages, array_size); out: nfs_free_fattr(entry->fattr); nfs_free_fhandle(entry->fh); kfree(entry); return status; } static void nfs_readdir_folio_put(struct nfs_readdir_descriptor *desc) { folio_put(desc->folio); desc->folio = NULL; } static void nfs_readdir_folio_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { folio_unlock(desc->folio); nfs_readdir_folio_put(desc); } static struct folio * nfs_readdir_folio_get_cached(struct nfs_readdir_descriptor *desc) { struct address_space *mapping = desc->file->f_mapping; u64 change_attr = inode_peek_iversion_raw(mapping->host); u64 cookie = desc->last_cookie; struct folio *folio; folio = nfs_readdir_folio_get_locked(mapping, cookie, change_attr); if (!folio) return NULL; if (desc->clear_cache && !nfs_readdir_folio_needs_filling(folio)) nfs_readdir_folio_reinit_array(folio, cookie, change_attr); return folio; } /* * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. 
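 * (Editorial note: when 0 is returned the folio stays locked; the caller is
 *  expected to release it later via nfs_readdir_folio_unlock_and_put_cached(),
 *  as nfs_readdir() does after nfs_do_filldir().)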
*/ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; desc->folio = nfs_readdir_folio_get_cached(desc); if (!desc->folio) return -ENOMEM; if (nfs_readdir_folio_needs_filling(desc->folio)) { /* Grow the dtsize if we had to go back for more pages */ if (desc->folio_index == desc->folio_index_max) nfs_grow_dtsize(desc); desc->folio_index_max = desc->folio_index; trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf, desc->last_cookie, desc->folio->index, desc->dtsize); res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, &desc->folio, 1); if (res < 0) { nfs_readdir_folio_unlock_and_put_cached(desc); trace_nfs_readdir_cache_fill_done(inode, res); if (res == -EBADCOOKIE || res == -ENOTSYNC) { invalidate_inode_pages2(desc->file->f_mapping); nfs_readdir_rewind_search(desc); trace_nfs_readdir_invalidate_cache_range( inode, 0, MAX_LFS_FILESIZE); return -EAGAIN; } return res; } /* * Set the cookie verifier if the page cache was empty */ if (desc->last_cookie == 0 && memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) { memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); invalidate_inode_pages2_range(desc->file->f_mapping, 1, -1); trace_nfs_readdir_invalidate_cache_range( inode, 1, MAX_LFS_FILESIZE); } desc->clear_cache = false; } res = nfs_readdir_search_array(desc); if (res == 0) return 0; nfs_readdir_folio_unlock_and_put_cached(desc); return res; } /* Search for desc->dir_cookie from the beginning of the page cache */ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; do { res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; } #define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL) /* * Once we've found the start of the dirent within a page: fill 'er up... */ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, const __be32 *verf) { struct file *file = desc->file; struct nfs_cache_array *array; unsigned int i; bool first_emit = !desc->dir_cookie; array = kmap_local_folio(desc->folio, 0); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; /* * nfs_readdir_handle_cache_misses return force clear at * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 * entries need be emitted here. */ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { desc->eob = true; break; } ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eob = true; break; } memcpy(desc->verf, verf, sizeof(desc->verf)); if (i == array->size - 1) { desc->dir_cookie = array->last_cookie; nfs_readdir_seek_next_array(array, desc); } else { desc->dir_cookie = array->array[i + 1].cookie; desc->last_cookie = array->array[0].cookie; } if (nfs_readdir_use_cookie(file)) desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; } if (array->folio_is_eof) desc->eof = !desc->eob; kunmap_local(array); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", (unsigned long long)desc->dir_cookie); } /* * If we cannot find a cookie in our cache, we suspect that this is * because it points to a deleted file, so we ask the server to return * whatever it thinks is the next entry. We then feed this to filldir. 
* If all goes well, we should then be able to find our way round the * cache on the next call to readdir_search_pagecache(); * * NOTE: we cannot add the anonymous page to the pagecache because * the data it contains might not be page aligned. Besides, * we should already have a complete representation of the * directory in the page cache by the time we get here. */ static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct folio **arrays; size_t i, sz = 512; __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status = -ENOMEM; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", (unsigned long long)desc->dir_cookie); arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); if (!arrays) goto out; arrays[0] = nfs_readdir_folio_array_alloc(desc->dir_cookie, GFP_KERNEL); if (!arrays[0]) goto out; desc->folio_index = 0; desc->cache_entry_index = 0; desc->last_cookie = desc->dir_cookie; desc->folio_index_max = 0; trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie, -1, desc->dtsize); status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); if (status < 0) { trace_nfs_readdir_uncached_done(file_inode(desc->file), status); goto out_free; } for (i = 0; !desc->eob && i < sz && arrays[i]; i++) { desc->folio = arrays[i]; nfs_do_filldir(desc, verf); } desc->folio = NULL; /* * Grow the dtsize if we have to go back for more pages, * or shrink it if we're reading too many. */ if (!desc->eof) { if (!desc->eob) nfs_grow_dtsize(desc); else if (desc->buffer_fills == 1 && i < (desc->folio_index_max >> 1)) nfs_shrink_dtsize(desc); } out_free: for (i = 0; i < sz && arrays[i]; i++) nfs_readdir_folio_array_free(arrays[i]); out: if (!nfs_readdir_use_cookie(desc->file)) nfs_readdir_rewind_search(desc); desc->folio_index_max = -1; kfree(arrays); dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } static bool nfs_readdir_handle_cache_misses(struct inode *inode, struct nfs_readdir_descriptor *desc, unsigned int cache_misses, bool force_clear) { if (desc->ctx->pos == 0 || !desc->plus) return false; if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear) return false; trace_nfs_readdir_force_readdirplus(inode); return true; } /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. */ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; unsigned int cache_hits, cache_misses; bool force_clear; int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); /* * ctx->pos points to the dirent entry number. * *desc->dir_cookie has the cookie for the next entry. We have * to either find the entry with the appropriate number or * revalidate the cookie. 
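 * (Editorial note: whether ctx->pos carries the raw cookie or the entry index
 *  depends on nfs_readdir_use_cookie(); cf. nfs_do_filldir() and
 *  nfs_llseek_dir().)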
*/ nfs_revalidate_mapping(inode, file->f_mapping); res = -ENOMEM; desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) goto out; desc->file = file; desc->ctx = ctx; desc->folio_index_max = -1; spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; desc->folio_index = dir_ctx->page_index; desc->last_cookie = dir_ctx->last_cookie; desc->attr_gencount = dir_ctx->attr_gencount; desc->eof = dir_ctx->eof; nfs_set_dtsize(desc, dir_ctx->dtsize); memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0); cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0); force_clear = dir_ctx->force_clear; spin_unlock(&file->f_lock); if (desc->eof) { res = 0; goto out_free; } desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses); force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses, force_clear); desc->clear_cache = force_clear; do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) continue; if (res == -EBADCOOKIE || res == -ENOTSYNC) res = 0; } break; } if (res == -ETOOSMALL && desc->plus) { nfs_zap_caches(inode); desc->plus = false; desc->eof = false; continue; } if (res < 0) break; nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_folio_unlock_and_put_cached(desc); if (desc->folio_index == desc->folio_index_max) desc->clear_cache = force_clear; } while (!desc->eob && !desc->eof); spin_lock(&file->f_lock); dir_ctx->dir_cookie = desc->dir_cookie; dir_ctx->last_cookie = desc->last_cookie; dir_ctx->attr_gencount = desc->attr_gencount; dir_ctx->page_index = desc->folio_index; dir_ctx->force_clear = force_clear; dir_ctx->eof = desc->eof; dir_ctx->dtsize = desc->dtsize; memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); out_free: kfree(desc); out: dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); switch (whence) { default: return -EINVAL; case SEEK_SET: if (offset < 0) return -EINVAL; spin_lock(&filp->f_lock); break; case SEEK_CUR: if (offset == 0) return filp->f_pos; spin_lock(&filp->f_lock); offset += filp->f_pos; if (offset < 0) { spin_unlock(&filp->f_lock); return -EINVAL; } } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->page_index = 0; if (!nfs_readdir_use_cookie(filp)) { dir_ctx->dir_cookie = 0; dir_ctx->last_cookie = 0; } else { dir_ctx->dir_cookie = offset; dir_ctx->last_cookie = offset; } dir_ctx->eof = false; } spin_unlock(&filp->f_lock); return offset; } /* * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. */ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, int datasync) { dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); nfs_inc_stats(file_inode(filp), NFSIOS_VFSFSYNC); return 0; } /** * nfs_force_lookup_revalidate - Mark the directory as having changed * @dir: pointer to directory inode * * This forces the revalidation code in nfs_lookup_revalidate() to do a * full lookup on all child dentries of 'dir' whenever a change occurs * on the server that might have invalidated our dcache. 
* * Note that we reserve bit '0' as a tag to let us know when a dentry * was revalidated while holding a delegation on its inode. * * The caller should be holding dir->i_lock */ void nfs_force_lookup_revalidate(struct inode *dir) { NFS_I(dir)->cache_change_attribute += 2; } EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); /** * nfs_verify_change_attribute - Detects NFS remote directory changes * @dir: pointer to parent directory inode * @verf: previously saved change attribute * * Return "false" if the verifiers doesn't match the change attribute. * This would usually indicate that the directory contents have changed on * the server, and that any dentries need revalidating. */ static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf) { return (verf & ~1UL) == nfs_save_change_attribute(dir); } static void nfs_set_verifier_delegated(unsigned long *verf) { *verf |= 1UL; } #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_unset_verifier_delegated(unsigned long *verf) { *verf &= ~1UL; } #endif /* IS_ENABLED(CONFIG_NFS_V4) */ static bool nfs_test_verifier_delegated(unsigned long verf) { return verf & 1; } static bool nfs_verifier_is_delegated(struct dentry *dentry) { return nfs_test_verifier_delegated(dentry->d_time); } static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) { struct inode *inode = d_inode(dentry); struct inode *dir = d_inode_rcu(dentry->d_parent); if (!dir || !nfs_verify_change_attribute(dir, verf)) return; if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ, 0)) nfs_set_verifier_delegated(&verf); dentry->d_time = verf; } /** * nfs_set_verifier - save a parent directory verifier in the dentry * @dentry: pointer to dentry * @verf: verifier to save * * Saves the parent directory verifier in @dentry. If the inode has * a delegation, we also tag the dentry as having been revalidated * while holding a delegation so that we know we don't have to * look it up again after a directory change. */ void nfs_set_verifier(struct dentry *dentry, unsigned long verf) { spin_lock(&dentry->d_lock); nfs_set_verifier_locked(dentry, verf); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL_GPL(nfs_set_verifier); #if IS_ENABLED(CONFIG_NFS_V4) /** * nfs_clear_verifier_delegated - clear the dir verifier delegation tag * @inode: pointer to inode * * Iterates through the dentries in the inode alias list and clears * the tag used to indicate that the dentry has been revalidated * while holding a delegation. * This function is intended for use when the delegation is being * returned or revoked. */ void nfs_clear_verifier_delegated(struct inode *inode) { struct dentry *alias; if (!inode) return; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); nfs_unset_verifier_delegated(&alias->d_time); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) && d_really_is_negative(dentry)) return dentry->d_time == inode_peek_iversion_raw(dir); return nfs_verify_change_attribute(dir, dentry->d_time); } /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy * and may need to be looked up again. * If rcu_walk prevents us from performing a full check, return 0. 
*/ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, int rcu_walk) { if (IS_ROOT(dentry)) return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) return 0; if (!nfs_dentry_verify_change(dir, dentry)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { if (rcu_walk) return 0; if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) return 0; } if (!nfs_dentry_verify_change(dir, dentry)) return 0; return 1; } /* * Use intent information to check whether or not we're going to do * an O_EXCL create using this path component. */ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags) { if (NFS_PROTO(dir)->version == 2) return 0; return flags & LOOKUP_EXCL; } /* * Inode and filehandle revalidation for lookups. * * We force revalidation in the cases where the VFS sets LOOKUP_REVAL, * or if the intent information indicates that we're about to open this * particular file and the "nocto" mount flag is not set. * */ static int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) { struct nfs_server *server = NFS_SERVER(inode); int ret; if (IS_AUTOMOUNT(inode)) return 0; if (flags & LOOKUP_OPEN) { switch (inode->i_mode & S_IFMT) { case S_IFREG: /* A NFSv4 OPEN will revalidate later */ if (server->caps & NFS_CAP_ATOMIC_OPEN) goto out; fallthrough; case S_IFDIR: if (server->flags & NFS_MOUNT_NOCTO) break; /* NFS close-to-open cache consistency validation */ goto out_force; } } /* VFS wants an on-the-wire revalidation */ if (flags & LOOKUP_REVAL) goto out_force; out: if (inode->i_nlink > 0 || (inode->i_nlink == 0 && test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags))) return 0; else return -ESTALE; out_force: if (flags & LOOKUP_RCU) return -ECHILD; ret = __nfs_revalidate_inode(server, inode); if (ret != 0) return ret; goto out; } static void nfs_mark_dir_for_revalidate(struct inode *inode) { spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); spin_unlock(&inode->i_lock); } /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. * * If parent mtime has changed, we revalidate, else we wait for a * period corresponding to the parent's attribute cache timeout value. * * If LOOKUP_RCU prevents us from performing a full check, return 1 * suggesting a reval is needed. * * Note that when creating a new file, or looking up a rename target, * then it shouldn't be necessary to revalidate a negative dentry. */ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; /* Case insensitive server? Revalidate negative dentries */ if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) return 1; return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } static int nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, struct inode *inode, int error) { switch (error) { case 1: break; case -ETIMEDOUT: if (inode && (IS_ROOT(dentry) || NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)) error = 1; break; case -ESTALE: case -ENOENT: error = 0; fallthrough; default: /* * We can't d_drop the root of a disconnected tree: * its d_hash is on the s_anon list and d_drop() would hide * it from shrink_dcache_for_unmount(), leading to busy * inodes on unmount and further oopses. 
*/ if (inode && IS_ROOT(dentry)) error = 1; break; } trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error); return error; } static int nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, unsigned int flags) { int ret = 1; if (nfs_neg_need_reval(dir, dentry, flags)) { if (flags & LOOKUP_RCU) return -ECHILD; ret = 0; } return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); } static int nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, struct inode *inode) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return nfs_lookup_revalidate_done(dir, dentry, inode, 1); } static int nfs_lookup_revalidate_dentry(struct inode *dir, const struct qstr *name, struct dentry *dentry, struct inode *inode, unsigned int flags) { struct nfs_fh *fhandle; struct nfs_fattr *fattr; unsigned long dir_verifier; int ret; trace_nfs_lookup_revalidate_enter(dir, dentry, flags); ret = -ENOMEM; fhandle = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode)); if (fhandle == NULL || fattr == NULL) goto out; dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, name, fhandle, fattr); if (ret < 0) goto out; /* Request help from readdirplus */ nfs_lookup_advise_force_readdirplus(dir, flags); ret = 0; if (nfs_compare_fh(NFS_FH(inode), fhandle)) goto out; if (nfs_refresh_inode(inode, fattr) < 0) goto out; nfs_setsecurity(inode, fattr); nfs_set_verifier(dentry, dir_verifier); ret = 1; out: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); /* * If the lookup failed despite the dentry change attribute being * a match, then we should revalidate the directory cache. */ if (!ret && nfs_dentry_verify_change(dir, dentry)) nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } /* * This is called every time the dcache has a lookup hit, * and we should check whether we can really trust that * lookup. * * NOTE! The hit can be a negative hit too, don't assume * we have an inode! * * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. 
*/ static int nfs_do_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; int error = 0; nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); if (!inode) return nfs_lookup_revalidate_negative(dir, dentry, flags); if (is_bad_inode(inode)) { dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; } if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 && nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) goto out_bad; if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { if (error == -ESTALE) nfs_mark_dir_for_revalidate(dir); goto out_bad; } goto out_valid; } if (flags & LOOKUP_RCU) return -ECHILD; if (NFS_STALE(inode)) goto out_bad; return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); out_valid: return nfs_lookup_revalidate_done(dir, dentry, inode, 1); out_bad: if (flags & LOOKUP_RCU) return -ECHILD; return nfs_lookup_revalidate_done(dir, dentry, inode, error); } static int __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) { if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) return -ECHILD; } else { /* Wait for unlink to complete - see unblock_revalidate() */ wait_var_event(&dentry->d_fsdata, smp_load_acquire(&dentry->d_fsdata) != NFS_FSDATA_BLOCKED); } return 0; } static int nfs_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (__nfs_lookup_revalidate(dentry, flags)) return -ECHILD; return nfs_do_lookup_revalidate(dir, name, dentry, flags); } static void block_revalidate(struct dentry *dentry) { /* old devname - just in case */ kfree(dentry->d_fsdata); /* Any new reference that could lead to an open * will take ->d_lock in lookup_open() -> d_lookup(). * Holding this lock ensures we cannot race with * __nfs_lookup_revalidate() and removes and need * for further barriers. */ lockdep_assert_held(&dentry->d_lock); dentry->d_fsdata = NFS_FSDATA_BLOCKED; } static void unblock_revalidate(struct dentry *dentry) { /* store_release ensures wait_var_event() sees the update */ smp_store_release(&dentry->d_fsdata, NULL); wake_up_var(&dentry->d_fsdata); } /* * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a * pathwalk ends on a dentry that was not found via a normal lookup in the * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). * * In this situation, we just want to verify that the inode itself is OK * since the dentry might have changed on the server. */ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode = d_inode(dentry); int error = 0; /* * I believe we can only get a negative dentry here in the case of a * procfs-style symlink. Just assume it's correct for now, but we may * eventually need to do something more here. 
*/ if (!inode) { dfprintk(LOOKUPCACHE, "%s: %pd2 has negative inode\n", __func__, dentry); return 1; } if (is_bad_inode(inode)) { dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); return 0; } error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; } /* * This is called from dput() when d_count is going to 0. */ static int nfs_dentry_delete(const struct dentry *dentry) { dfprintk(VFS, "NFS: dentry_delete(%pd2, %x)\n", dentry, dentry->d_flags); /* Unhash any dentry with a stale inode */ if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry))) return 1; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { /* Unhash it, so that ->d_iput() would be called */ return 1; } if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; } return 0; } /* Ensure that we revalidate inode->i_nlink */ static void nfs_drop_nlink(struct inode *inode) { spin_lock(&inode->i_lock); /* drop the inode if we're reasonably sure this is the last link */ if (inode->i_nlink > 0) drop_nlink(inode); NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid( inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_NLINK); spin_unlock(&inode->i_lock); } /* * Called when the dentry loses inode. * We use it to clean up silly-renamed files. */ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { nfs_complete_unlink(dentry, inode); nfs_drop_nlink(inode); } iput(inode); } static void nfs_d_release(struct dentry *dentry) { /* free cached devname value, if it survived that far */ if (unlikely(dentry->d_fsdata)) { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) WARN_ON(1); else kfree(dentry->d_fsdata); } } const struct dentry_operations nfs_dentry_operations = { .d_revalidate = nfs_lookup_revalidate, .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, .d_release = nfs_d_release, }; EXPORT_SYMBOL_GPL(nfs_dentry_operations); struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; unsigned long dir_verifier; int error; dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); nfs_inc_stats(dir, NFSIOS_VFSLOOKUP); if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen)) return ERR_PTR(-ENAMETOOLONG); /* * If we're doing an exclusive create, optimize away the lookup * but don't hash the dentry. 
*/ if (nfs_is_exclusive_create(dir, flags) || flags & LOOKUP_RENAME_TARGET) return NULL; res = ERR_PTR(-ENOMEM); fhandle = nfs_alloc_fhandle(); fattr = nfs_alloc_fattr_with_label(NFS_SERVER(dir)); if (fhandle == NULL || fattr == NULL) goto out; dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) { if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); goto no_entry; } if (error < 0) { res = ERR_PTR(error); goto out; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); res = ERR_CAST(inode); if (IS_ERR(res)) goto out; /* Notify readdir to use READDIRPLUS */ nfs_lookup_advise_force_readdirplus(dir, flags); no_entry: res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) goto out; dentry = res; } nfs_set_verifier(dentry, dir_verifier); out: trace_nfs_lookup_exit(dir, dentry, flags, PTR_ERR_OR_ZERO(res)); nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); return res; } EXPORT_SYMBOL_GPL(nfs_lookup); void nfs_d_prune_case_insensitive_aliases(struct inode *inode) { /* Case insensitive server? Revalidate dentries */ if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) d_prune_aliases(inode); } EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); #if IS_ENABLED(CONFIG_NFS_V4) static int nfs4_lookup_revalidate(struct inode *, const struct qstr *, struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, .d_release = nfs_d_release, }; EXPORT_SYMBOL_GPL(nfs4_dentry_operations); static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); } static int do_open(struct inode *inode, struct file *filp) { nfs_fscache_open_file(inode, filp); return 0; } static int nfs_finish_open(struct nfs_open_context *ctx, struct dentry *dentry, struct file *file, unsigned open_flags) { int err; err = finish_open(file, dentry, do_open); if (err) goto out; if (S_ISREG(file_inode(file)->i_mode)) nfs_file_set_open_context(file, ctx); else err = -EOPENSTALE; out: return err; } int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; unsigned long dir_verifier; bool switched = false; int created = 0; int err; /* Expect a negative dentry */ BUG_ON(d_inode(dentry)); dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); if (err) return err; /* NFS only supports OPEN on regular files */ if ((open_flags & O_DIRECTORY)) { if (!d_in_lookup(dentry)) { /* * Hashed negative dentry with O_DIRECTORY: dentry was * revalidated and is fine, no need to perform lookup * again */ return -ENOENT; } lookup_flags = LOOKUP_OPEN|LOOKUP_DIRECTORY; goto no_open; } if (dentry->d_name.len > NFS_SERVER(dir)->namelen) return -ENAMETOOLONG; if (open_flags & O_CREAT) { struct nfs_server *server = NFS_SERVER(dir); if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) mode &= ~current_umask(); attr.ia_valid 
|= ATTR_MODE; attr.ia_mode = mode; } if (open_flags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; } if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) { d_drop(dentry); switched = true; dentry = d_alloc_parallel(dentry->d_parent, &dentry->d_name, &wq); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (unlikely(!d_in_lookup(dentry))) return finish_no_open(file, dentry); } ctx = create_nfs_open_context(dentry, open_flags, file); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; trace_nfs_atomic_open_enter(dir, ctx, open_flags); inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr, &created); if (created) file->f_mode |= FMODE_CREATED; if (IS_ERR(inode)) { err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); d_drop(dentry); switch (err) { case -ENOENT: d_splice_alias(NULL, dentry); if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) dir_verifier = inode_peek_iversion_raw(dir); else dir_verifier = nfs_save_change_attribute(dir); nfs_set_verifier(dentry, dir_verifier); break; case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(open_flags & O_NOFOLLOW)) goto no_open; break; /* case -EINVAL: */ default: break; } goto out; } file->f_mode |= FMODE_CAN_ODIRECT; err = nfs_finish_open(ctx, ctx->dentry, file, open_flags); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: if (unlikely(switched)) { d_lookup_done(dentry); dput(dentry); } return err; no_open: res = nfs_lookup(dir, dentry, lookup_flags); if (!res) { inode = d_inode(dentry); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) res = ERR_PTR(-ENOTDIR); else if (inode && S_ISREG(inode->i_mode)) res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) { dput(res); res = ERR_PTR(-ENOTDIR); } else if (inode && S_ISREG(inode->i_mode)) { dput(res); res = ERR_PTR(-EOPENSTALE); } } if (switched) { d_lookup_done(dentry); if (!res) res = dentry; else dput(dentry); } if (IS_ERR(res)) return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); static int nfs4_lookup_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct inode *inode; if (__nfs_lookup_revalidate(dentry, flags)) return -ECHILD; trace_nfs_lookup_revalidate_enter(dir, dentry, flags); if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) goto full_reval; if (d_mountpoint(dentry)) goto full_reval; inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. 
*/ if (inode == NULL) goto full_reval; if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) goto full_reval; /* We cannot do exclusive creation on a positive dentry */ if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) goto reval_dentry; /* Check if the directory changed */ if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) goto reval_dentry; /* Let f_op->open() actually open (and revalidate) the file */ return 1; reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; return nfs_lookup_revalidate_dentry(dir, name, dentry, inode, flags); full_reval: return nfs_do_lookup_revalidate(dir, name, dentry, flags); } #endif /* CONFIG_NFSV4 */ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode) { /* Same as look+open from lookup_open(), but with different O_TRUNC * handling. */ int error = 0; if (dentry->d_name.len > NFS_SERVER(dir)->namelen) return -ENAMETOOLONG; if (open_flags & O_CREAT) { file->f_mode |= FMODE_CREATED; error = nfs_do_create(dir, dentry, mode, open_flags); if (error) return error; return finish_open(file, dentry, NULL); } else if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and * we want those to be zero so the lookup isn't skipped. */ struct dentry *res = nfs_lookup(dir, dentry, 0); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) return PTR_ERR(res); return finish_no_open(file, res); } } return finish_no_open(file, NULL); } EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); struct dentry * nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct dentry *parent = dget_parent(dentry); struct inode *dir = d_inode(parent); struct inode *inode; struct dentry *d; int error; d_drop(dentry); if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, dentry, &dentry->d_name, fhandle, fattr); if (error) goto out_error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL); if (error < 0) goto out_error; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); d = d_splice_alias(inode, dentry); out: dput(parent); return d; out_error: d = ERR_PTR(error); goto out; } EXPORT_SYMBOL_GPL(nfs_add_or_obtain); /* * Code common to create, mkdir, and mknod. */ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct dentry *d; d = nfs_add_or_obtain(dentry, fhandle, fattr); if (IS_ERR(d)) return PTR_ERR(d); /* Callers don't care */ dput(d); return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); /* * Following a failed create operation, we drop the dentry rather * than retain a negative dentry. This avoids a problem in the event * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. 
*/ static int nfs_do_create(struct inode *dir, struct dentry *dentry, umode_t mode, int open_flags) { struct iattr attr; int error; open_flags |= O_CREAT; dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; if (open_flags & O_TRUNC) { attr.ia_size = 0; attr.ia_valid |= ATTR_SIZE; } trace_nfs_create_enter(dir, dentry, open_flags); error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags); trace_nfs_create_exit(dir, dentry, open_flags, error); if (error != 0) goto out_err; return 0; out_err: d_drop(dentry); return error; } int nfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0); } EXPORT_SYMBOL_GPL(nfs_create); /* * See comments for nfs_proc_create regarding failed operations. */ int nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; attr.ia_valid = ATTR_MODE; trace_nfs_mknod_enter(dir, dentry); status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); trace_nfs_mknod_exit(dir, dentry, status); if (status != 0) goto out_err; return 0; out_err: d_drop(dentry); return status; } EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct iattr attr; int error; dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; attr.ia_mode = mode | S_IFDIR; trace_nfs_mkdir_enter(dir, dentry); error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); trace_nfs_mkdir_exit(dir, dentry, error); if (error != 0) goto out_err; return 0; out_err: d_drop(dentry); return error; } EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { if (simple_positive(dentry)) d_delete(dentry); } static void nfs_dentry_remove_handle_error(struct inode *dir, struct dentry *dentry, int error) { switch (error) { case -ENOENT: if (d_really_is_positive(dentry)) d_delete(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; case 0: nfs_d_prune_case_insensitive_aliases(d_inode(dentry)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } } int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); if (d_really_is_positive(dentry)) { down_write(&NFS_I(d_inode(dentry))->rmdir_sem); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { case 0: clear_nlink(d_inode(dentry)); break; case -ENOENT: nfs_dentry_handle_enoent(dentry); } up_write(&NFS_I(d_inode(dentry))->rmdir_sem); } else error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); nfs_dentry_remove_handle_error(dir, dentry, error); trace_nfs_rmdir_exit(dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_rmdir); /* * Remove a file after making sure there are no pending writes, * and after checking that the file has only one user. * * We invalidate the attribute cache and free the inode prior to the operation * to avoid possible races if the server reuses the inode. 
*/ static int nfs_safe_remove(struct dentry *dentry) { struct inode *dir = d_inode(dentry->d_parent); struct inode *inode = d_inode(dentry); int error = -EBUSY; dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); /* If the dentry was sillyrenamed, we simply call d_delete() */ if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { error = 0; goto out; } trace_nfs_remove_enter(dir, dentry); if (inode != NULL) { error = NFS_PROTO(dir)->remove(dir, dentry); if (error == 0) nfs_drop_nlink(inode); } else error = NFS_PROTO(dir)->remove(dir, dentry); if (error == -ENOENT) nfs_dentry_handle_enoent(dentry); trace_nfs_remove_exit(dir, dentry, error); out: return error; } /* We do silly rename. In case sillyrename() returns -EBUSY, the inode * belongs to an active ".nfs..." file and we return -EBUSY. * * If sillyrename() returns 0, we do nothing, otherwise we unlink. */ int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); spin_lock(&dentry->d_lock); if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(d_inode(dentry))->flags)) { spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ write_inode_now(d_inode(dentry), 0); error = nfs_sillyrename(dir, dentry); goto out; } /* We must prevent any concurrent open until the unlink * completes. ->d_revalidate will wait for ->d_fsdata * to clear. We set it here to ensure no lookup succeeds until * the unlink is complete on the server. */ error = -ETXTBSY; if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) { spin_unlock(&dentry->d_lock); goto out; } block_revalidate(dentry); spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); unblock_revalidate(dentry); out: trace_nfs_unlink_exit(dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_unlink); /* * To create a symbolic link, most file systems instantiate a new inode, * add a page to it containing the path, then write it out to the disk * using prepare_write/commit_write. * * Unfortunately the NFS client can't create the in-core inode first * because it needs a file handle to create an in-core inode (see * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the * symlink request has completed on the server. * * So instead we allocate a raw page, copy the symname into it, then do * the SYMLINK request with the page as the buffer. If it succeeds, we * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. 
*/ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct folio *folio; char *kaddr; struct iattr attr; unsigned int pathlen = strlen(symname); int error; dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) return -ENAMETOOLONG; attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; folio = folio_alloc(GFP_USER, 0); if (!folio) return -ENOMEM; kaddr = folio_address(folio); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); folio_put(folio); return error; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); /* * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0, GFP_KERNEL) == 0) { folio_mark_uptodate(folio); folio_unlock(folio); } folio_put(folio); return 0; } EXPORT_SYMBOL_GPL(nfs_symlink); int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int error; dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", old_dentry, dentry); trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); if (S_ISREG(inode->i_mode)) nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); ihold(inode); d_add(dentry, inode); } trace_nfs_link_exit(inode, dir, dentry, error); return error; } EXPORT_SYMBOL_GPL(nfs_link); static void nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) { struct dentry *new_dentry = data->new_dentry; unblock_revalidate(new_dentry); } /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a * different file handle for the same inode after a rename (e.g. when * moving to a different directory). A fail-safe method to do so would * be to look up old_dir/old_name, create a link to new_dir/new_name and * rename the old file using the sillyrename stuff. This way, the original * file in old_dir will go away when the last process iput()s the inode. * * FIXED. * * It actually works quite well. One needs to have the possibility for * at least one ".nfs..." file in each directory the file ever gets * moved or linked to which happens automagically with the new * implementation that only depends on the dcache stuff instead of * using the inode layer * * Unfortunately, things are a little more complicated than indicated * above. For a cross-directory move, we want to make sure we can get * rid of the old inode after the operation. This means there must be * no pending writes (if it's a file), and the use count must be 1. * If these conditions are met, we can drop the dentries before doing * the rename. 
*/ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct dentry *dentry = NULL; struct rpc_task *task; bool must_unblock = false; int error = -EBUSY; if (flags) return -EINVAL; dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n", old_dentry, new_dentry, d_count(new_dentry)); trace_nfs_rename_enter(old_dir, old_dentry, new_dir, new_dentry); /* * For non-directories, check whether the target is busy and if so, * make a copy of the dentry and then do a silly-rename. If the * silly-rename succeeds, the copied dentry is hashed and becomes * the new target. */ if (new_inode && !S_ISDIR(new_inode->i_mode)) { /* We must prevent any concurrent open until the unlink * completes. ->d_revalidate will wait for ->d_fsdata * to clear. We set it here to ensure no lookup succeeds until * the unlink is complete on the server. */ error = -ETXTBSY; if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) goto out; spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { int err; spin_unlock(&new_dentry->d_lock); /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, &new_dentry->d_name); if (!dentry) goto out; /* silly-rename the existing target ... */ err = nfs_sillyrename(new_dir, new_dentry); if (err) goto out; new_dentry = dentry; new_inode = NULL; } else { block_revalidate(new_dentry); must_unblock = true; spin_unlock(&new_dentry->d_lock); } } if (S_ISREG(old_inode->i_mode)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { if (must_unblock) unblock_revalidate(new_dentry); error = PTR_ERR(task); goto out; } error = rpc_wait_for_completion_task(task); if (error != 0) { ((struct nfs_renamedata *)task->tk_calldata)->cancelled = 1; /* Paired with the atomic_dec_and_test() barrier in rpc_do_put_task() */ smp_wmb(); } else error = task->tk_status; rpc_put_task(task); /* Ensure the inode attributes are revalidated */ if (error == 0) { spin_lock(&old_inode->i_lock); NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); spin_unlock(&old_inode->i_lock); } out: trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); if (!error) { if (new_inode != NULL) nfs_drop_nlink(new_inode); /* * The d_move() should be here instead of in an async RPC completion * handler because we need the proper locks to move the dentry. If * we're interrupted by a signal, the async RPC completion handler * should mark the directories for revalidation. */ d_move(old_dentry, new_dentry); nfs_set_verifier(old_dentry, nfs_save_change_attribute(new_dir)); } else if (error == -ENOENT) nfs_dentry_handle_enoent(old_dentry); /* new dentry created? 
*/ if (dentry) dput(dentry); return error; } EXPORT_SYMBOL_GPL(nfs_rename); static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); static atomic_long_t nfs_access_nr_entries; static unsigned long nfs_access_max_cachesize = 4*1024*1024; module_param(nfs_access_max_cachesize, ulong, 0644); MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); static void nfs_access_free_entry(struct nfs_access_entry *entry) { put_group_info(entry->group_info); kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); smp_mb__after_atomic(); } static void nfs_access_free_list(struct list_head *head) { struct nfs_access_entry *cache; while (!list_empty(head)) { cache = list_entry(head->next, struct nfs_access_entry, lru); list_del(&cache->lru); nfs_access_free_entry(cache); } } static unsigned long nfs_do_access_cache_scan(unsigned int nr_to_scan) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; long freed = 0; spin_lock(&nfs_access_lru_lock); list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { struct inode *inode; if (nr_to_scan-- == 0) break; inode = &nfsi->vfs_inode; spin_lock(&inode->i_lock); if (list_empty(&nfsi->access_cache_entry_lru)) goto remove_lru_entry; cache = list_entry(nfsi->access_cache_entry_lru.next, struct nfs_access_entry, lru); list_move(&cache->lru, &head); rb_erase(&cache->rb_node, &nfsi->access_cache); freed++; if (!list_empty(&nfsi->access_cache_entry_lru)) list_move_tail(&nfsi->access_cache_inode_lru, &nfs_access_lru_list); else { remove_lru_entry: list_del_init(&nfsi->access_cache_inode_lru); smp_mb__before_atomic(); clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); smp_mb__after_atomic(); } spin_unlock(&inode->i_lock); } spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); return freed; } unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; return nfs_do_access_cache_scan(nr_to_scan); } unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); } static void nfs_access_cache_enforce_limit(void) { long nr_entries = atomic_long_read(&nfs_access_nr_entries); unsigned long diff; unsigned int nr_to_scan; if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize) return; nr_to_scan = 100; diff = nr_entries - nfs_access_max_cachesize; if (diff < nr_to_scan) nr_to_scan = diff; nfs_do_access_cache_scan(nr_to_scan); } static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) { struct rb_root *root_node = &nfsi->access_cache; struct rb_node *n; struct nfs_access_entry *entry; /* Unhook entries from the cache */ while ((n = rb_first(root_node)) != NULL) { entry = rb_entry(n, struct nfs_access_entry, rb_node); rb_erase(n, root_node); list_move(&entry->lru, head); } nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; } void nfs_access_zap_cache(struct inode *inode) { LIST_HEAD(head); if (test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags) == 0) return; /* Remove from global LRU init */ spin_lock(&nfs_access_lru_lock); if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) list_del_init(&NFS_I(inode)->access_cache_inode_lru); spin_lock(&inode->i_lock); __nfs_access_zap_cache(NFS_I(inode), &head); spin_unlock(&inode->i_lock); 
spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); } EXPORT_SYMBOL_GPL(nfs_access_zap_cache); static int access_cmp(const struct cred *a, const struct nfs_access_entry *b) { struct group_info *ga, *gb; int g; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred) { struct rb_node *n = NFS_I(inode)->access_cache.rb_node; while (n != NULL) { struct nfs_access_entry *entry = rb_entry(n, struct nfs_access_entry, rb_node); int cmp = access_cmp(cred, entry); if (cmp < 0) n = n->rb_left; else if (cmp > 0) n = n->rb_right; else return entry; } return NULL; } static u64 nfs_access_login_time(const struct task_struct *task, const struct cred *cred) { const struct task_struct *parent; const struct cred *pcred; u64 ret; rcu_read_lock(); for (;;) { parent = rcu_dereference(task->real_parent); pcred = __task_cred(parent); if (parent == task || cred_fscmp(pcred, cred) != 0) break; task = parent; } ret = task->start_time; rcu_read_unlock(); return ret; } static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; bool retry = true; int err; spin_lock(&inode->i_lock); for(;;) { if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out_zap; cache = nfs_access_search_rbtree(inode, cred); err = -ENOENT; if (cache == NULL) goto out; /* Found an entry, is our attribute cache valid? */ if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) break; if (!retry) break; err = -ECHILD; if (!may_block) goto out; spin_unlock(&inode->i_lock); err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (err) return err; spin_lock(&inode->i_lock); retry = false; } err = -ENOENT; if ((s64)(login_time - cache->timestamp) > 0) goto out; *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; out: spin_unlock(&inode->i_lock); return err; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); return -ENOENT; } static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask) { /* Only check the most recently returned cache entry, * but do it without locking. 
*/ struct nfs_inode *nfsi = NFS_I(inode); u64 login_time = nfs_access_login_time(current, cred); struct nfs_access_entry *cache; int err = -ECHILD; struct list_head *lh; rcu_read_lock(); if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) goto out; lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || access_cmp(cred, cache) != 0) cache = NULL; if (cache == NULL) goto out; if ((s64)(login_time - cache->timestamp) > 0) goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; *mask = cache->mask; err = 0; out: rcu_read_unlock(); return err; } int nfs_access_get_cached(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { int status; status = nfs_access_get_cached_rcu(inode, cred, mask); if (status != 0) status = nfs_access_get_cached_locked(inode, cred, mask, may_block); return status; } EXPORT_SYMBOL_GPL(nfs_access_get_cached); static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set, const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; struct rb_node **p = &root_node->rb_node; struct rb_node *parent = NULL; struct nfs_access_entry *entry; int cmp; spin_lock(&inode->i_lock); while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); cmp = access_cmp(cred, entry); if (cmp < 0) p = &parent->rb_left; else if (cmp > 0) p = &parent->rb_right; else goto found; } rb_link_node(&set->rb_node, parent, p); rb_insert_color(&set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); spin_unlock(&inode->i_lock); return; found: rb_replace_node(parent, &set->rb_node, root_node); list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); list_del(&entry->lru); spin_unlock(&inode->i_lock); nfs_access_free_entry(entry); } void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, const struct cred *cred) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); cache->fsuid = cred->fsuid; cache->fsgid = cred->fsgid; cache->group_info = get_group_info(cred->group_info); cache->mask = set->mask; cache->timestamp = ktime_get_ns(); /* The above field assignments must be visible * before this item appears on the lru. We cannot easily * use rcu_assign_pointer, so just force the memory barrier. 
*/ smp_wmb(); nfs_access_add_rbtree(inode, cache, cred); /* Update accounting */ smp_mb__before_atomic(); atomic_long_inc(&nfs_access_nr_entries); smp_mb__after_atomic(); /* Add inode to global LRU list */ if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) { spin_lock(&nfs_access_lru_lock); if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); spin_unlock(&nfs_access_lru_lock); } nfs_access_cache_enforce_limit(); } EXPORT_SYMBOL_GPL(nfs_access_add_cache); #define NFS_MAY_READ (NFS_ACCESS_READ) #define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ NFS_ACCESS_EXTEND | \ NFS_ACCESS_DELETE) #define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE #define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) #define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { int mask = 0; if (access_result & NFS_MAY_READ) mask |= MAY_READ; if (S_ISDIR(umode)) { if ((access_result & NFS_DIR_MAY_WRITE) == NFS_DIR_MAY_WRITE) mask |= MAY_WRITE; if ((access_result & NFS_MAY_LOOKUP) == NFS_MAY_LOOKUP) mask |= MAY_EXEC; } else if (S_ISREG(umode)) { if ((access_result & NFS_FILE_MAY_WRITE) == NFS_FILE_MAY_WRITE) mask |= MAY_WRITE; if ((access_result & NFS_MAY_EXECUTE) == NFS_MAY_EXECUTE) mask |= MAY_EXEC; } else if (access_result & NFS_MAY_WRITE) mask |= MAY_WRITE; return mask; } void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) { entry->mask = access_result; } EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) { struct nfs_access_entry cache; bool may_block = (mask & MAY_NOT_BLOCK) == 0; int cache_mask = -1; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached(inode, cred, &cache.mask, may_block); if (status == 0) goto out_cached; status = -ECHILD; if (!may_block) goto out; /* * Determine which access bits we want to ask for... 
 */
	cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND |
		     nfs_access_xattr_mask(NFS_SERVER(inode));
	if (S_ISDIR(inode->i_mode))
		cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
	else
		cache.mask |= NFS_ACCESS_EXECUTE;
	status = NFS_PROTO(inode)->access(inode, &cache, cred);
	if (status != 0) {
		if (status == -ESTALE) {
			if (!S_ISDIR(inode->i_mode))
				nfs_set_inode_stale(inode);
			else
				nfs_zap_caches(inode);
		}
		goto out;
	}
	nfs_access_add_cache(inode, &cache, cred);
out_cached:
	cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode);
	if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0)
		status = -EACCES;
out:
	trace_nfs_access_exit(inode, mask, cache_mask, status);
	return status;
}

static int nfs_open_permission_mask(int openflags)
{
	int mask = 0;

	if (openflags & __FMODE_EXEC) {
		/* ONLY check exec rights */
		mask = MAY_EXEC;
	} else {
		if ((openflags & O_ACCMODE) != O_WRONLY)
			mask |= MAY_READ;
		if ((openflags & O_ACCMODE) != O_RDONLY)
			mask |= MAY_WRITE;
	}

	return mask;
}

int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags)
{
	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
}
EXPORT_SYMBOL_GPL(nfs_may_open);

static int nfs_execute_ok(struct inode *inode, int mask)
{
	struct nfs_server *server = NFS_SERVER(inode);
	int ret = 0;

	if (S_ISDIR(inode->i_mode))
		return 0;
	if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) {
		if (mask & MAY_NOT_BLOCK)
			return -ECHILD;
		ret = __nfs_revalidate_inode(server, inode);
	}
	if (ret == 0 && !execute_ok(inode))
		ret = -EACCES;
	return ret;
}

int nfs_permission(struct mnt_idmap *idmap,
		   struct inode *inode,
		   int mask)
{
	const struct cred *cred = current_cred();
	int res = 0;

	nfs_inc_stats(inode, NFSIOS_VFSACCESS);

	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
		goto out;
	/* Is this sys_access() ? */
	if (mask & (MAY_ACCESS | MAY_CHDIR))
		goto force_lookup;

	switch (inode->i_mode & S_IFMT) {
	case S_IFLNK:
		goto out;
	case S_IFREG:
		if ((mask & MAY_OPEN) &&
		    nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
			return 0;
		break;
	case S_IFDIR:
		/*
		 * Optimize away all write operations, since the server
		 * will check permissions when we perform the op.
		 */
		if ((mask & MAY_WRITE) && !(mask & MAY_READ))
			goto out;
	}

force_lookup:
	if (!NFS_PROTO(inode)->access)
		goto out_notsup;

	res = nfs_do_access(inode, cred, mask);
out:
	if (!res && (mask & MAY_EXEC))
		res = nfs_execute_ok(inode, mask);

	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
		inode->i_sb->s_id, inode->i_ino, mask, res);
	return res;
out_notsup:
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
	res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE |
						  NFS_INO_INVALID_OTHER);
	if (res == 0)
		res = generic_permission(&nop_mnt_idmap, inode, mask);
	goto out;
}
EXPORT_SYMBOL_GPL(nfs_permission);
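/*
 * Illustrative, userspace-only sketch (not part of fs/nfs/dir.c): it mimics
 * the mask arithmetic used by nfs_do_access() above, so the -EACCES decision
 * can be reasoned about in isolation.  The MAY_* values below are local
 * stand-ins for the kernel's definitions, and denied() is a hypothetical
 * helper, not a kernel function.
 */
#include <stdio.h>

#define MAY_EXEC	0x1
#define MAY_WRITE	0x2
#define MAY_READ	0x4

/*
 * "granted" plays the role of the ACCESS result after conversion by
 * nfs_access_calc_mask(); "requested" is the caller's permission mask.
 */
static int denied(int requested, int granted)
{
	/* Same test as nfs_do_access(): any requested rwx bit not granted
	 * by the server means the access check fails. */
	return (requested & ~granted & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0;
}

int main(void)
{
	int granted = MAY_READ | MAY_EXEC;	/* e.g. r-x from the server */

	printf("read-only access denied?  %d\n",
	       denied(MAY_READ, granted));			/* prints 0 */
	printf("read-write access denied? %d\n",
	       denied(MAY_READ | MAY_WRITE, granted));		/* prints 1 */
	return 0;
}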
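/*
 * Illustrative, userspace-only sketch (not part of fs/nfs/dir.c): it mirrors
 * the dentry->d_time tagging scheme used earlier in the file by
 * nfs_set_verifier() and nfs_verifier_is_delegated().  Bit 0 of the saved
 * parent-directory change attribute is reserved as a "revalidated while
 * holding a delegation" tag; nfs_force_lookup_revalidate() bumps the change
 * attribute by 2 so that bit stays free, and nfs_verify_change_attribute()
 * masks it off with ~1UL before comparing.  The helpers below are local
 * stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static void set_delegated(unsigned long *verf)   { *verf |= 1UL; }
static void clear_delegated(unsigned long *verf) { *verf &= ~1UL; }
static bool is_delegated(unsigned long verf)     { return verf & 1UL; }

/* Matches ignore the tag bit, like nfs_verify_change_attribute(). */
static bool verifier_matches(unsigned long verf, unsigned long dir_change_attr)
{
	return (verf & ~1UL) == dir_change_attr;
}

int main(void)
{
	unsigned long dir_change_attr = 42UL << 1;	/* always even: bumped by 2 */
	unsigned long verf = dir_change_attr;

	set_delegated(&verf);
	printf("delegated: %d, still matches: %d\n",
	       is_delegated(verf), verifier_matches(verf, dir_change_attr));

	clear_delegated(&verf);
	printf("delegated: %d, still matches: %d\n",
	       is_delegated(verf), verifier_matches(verf, dir_change_attr));
	return 0;
}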
// SPDX-License-Identifier: GPL-2.0
/*
 * Kernel timekeeping code and accessor functions. Based on code from
 * timer.c, moved in commit 8524070b7982.
 */
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include <linux/random.h>

#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"

#define TK_CLEAR_NTP		(1 << 0)
#define TK_CLOCK_WAS_SET	(1 << 1)
#define TK_UPDATE_ALL		(TK_CLEAR_NTP | TK_CLOCK_WAS_SET)

enum timekeeping_adv_mode {
	/* Update timekeeper when a tick has passed */
	TK_ADV_TICK,

	/* Update timekeeper on a direct frequency change */
	TK_ADV_FREQ
};

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 */
struct tk_data {
	seqcount_raw_spinlock_t	seq;
	struct timekeeper	timekeeper;
	struct timekeeper	shadow_timekeeper;
	raw_spinlock_t		lock;
} ____cacheline_aligned;

static struct tk_data tk_core;

/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:	Sequence counter for protecting updates. The lowest bit
 *		is the index for the tk_read_base array
 * @base:	tk_read_base array. Access is indexed by the lowest bit of
 *		@seq.
 *
 * See @update_fast_timekeeper() below.
 */
struct tk_fast {
	seqcount_latch_t	seq;
	struct tk_read_base	base[2];
};

/* Suspend-time cycles value for halted fast timekeeper. */
static u64 cycles_at_suspend;

static u64 dummy_clock_read(struct clocksource *cs)
{
	if (timekeeping_suspended)
		return cycles_at_suspend;
	return local_clock();
}

static struct clocksource dummy_clock = {
	.read = dummy_clock_read,
};

/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed then
 * the fast time keepers are updated with the correct values.
 */
#define FAST_TK_INIT						\
	{							\
		.clock		= &dummy_clock,			\
		.mask		= CLOCKSOURCE_MASK(64),		\
		.mult		= 1,				\
		.shift		= 0,				\
	}

static struct tk_fast tk_fast_mono ____cacheline_aligned = {
	.seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
	.base[0] = FAST_TK_INIT,
	.base[1] = FAST_TK_INIT,
};

static struct tk_fast tk_fast_raw ____cacheline_aligned = {
	.seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
	.base[0] = FAST_TK_INIT,
	.base[1] = FAST_TK_INIT,
};

unsigned long timekeeper_lock_irqsave(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&tk_core.lock, flags);
	return flags;
}

void timekeeper_unlock_irqrestore(unsigned long flags)
{
	raw_spin_unlock_irqrestore(&tk_core.lock, flags);
}

/*
 * Multigrain timestamps require tracking the latest fine-grained timestamp
 * that has been issued, and never returning a coarse-grained timestamp that is
 * earlier than that value.
 *
 * mg_floor represents the latest fine-grained time that has been handed out as
 * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
 * converted to a realtime clock value on an as-needed basis.
 *
 * Maintaining mg_floor ensures the multigrain interfaces never issue a
 * timestamp earlier than one that has been previously issued.
 *
 * The exception to this rule is when there is a backward realtime clock jump. If
 * such an event occurs, a timestamp can appear to be earlier than a previous one.
 */
static __cacheline_aligned_in_smp atomic64_t mg_floor;

static inline void tk_normalize_xtime(struct timekeeper *tk)
{
	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
		tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
		tk->xtime_sec++;
	}
	while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
		tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
		tk->raw_sec++;
	}
}

static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
	struct timespec64 ts;

	ts.tv_sec = tk->xtime_sec;
	ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
	return ts;
}

static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
	tk->xtime_sec = ts->tv_sec;
	tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}

static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
	tk->xtime_sec += ts->tv_sec;
	tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
	tk_normalize_xtime(tk);
}

static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
	struct timespec64 tmp;

	/*
	 * Verify consistency of: offset_real = -wall_to_monotonic
	 * before modifying anything
	 */
	set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
					-tk->wall_to_monotonic.tv_nsec);
	WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
	tk->wall_to_monotonic = wtm;
	set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
	WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}

static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
	/* Paired with READ_ONCE() in ktime_mono_to_any() */
	WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta));
	/*
	 * Timespec representation for VDSO update to avoid 64bit division
	 * on every update.
	 */
	tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}

/*
 * tk_clock_read - atomic clocksource read() helper
 *
 * This helper is necessary to use in the read paths because, while the
 * seqcount ensures we don't return a bad value while structures are updated,
 * it doesn't protect from potential crashes. There is the possibility that
 * the tkr's clocksource may change between the read reference, and the
 * clock reference passed to the read function. This can cause crashes if
 * the wrong clocksource is passed to the wrong read function.
 * This isn't necessary to use when holding the tk_core.lock or doing
 * a read of the fast-timekeeper tkrs (which is protected by its own locking
 * and update logic).
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
	struct clocksource *clock = READ_ONCE(tkr->clock);

	return clock->read(clock);
}

/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:		The target timekeeper to setup.
 * @clock:	Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
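 *
 * Roughly, and ignoring the rounding done in the body below, the derived
 * per-NTP-interval values are (a sketch of the existing math, not new
 * behaviour):
 *
 *	cycle_interval	= (NTP_INTERVAL_LENGTH << clock->shift) / clock->mult
 *	xtime_interval	= cycle_interval * clock->mult
 *	xtime_remainder	= (NTP_INTERVAL_LENGTH << clock->shift) - xtime_interval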
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
	u64 interval;
	u64 tmp, ntpinterval;
	struct clocksource *old_clock;

	++tk->cs_was_changed_seq;
	old_clock = tk->tkr_mono.clock;
	tk->tkr_mono.clock = clock;
	tk->tkr_mono.mask = clock->mask;
	tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

	tk->tkr_raw.clock = clock;
	tk->tkr_raw.mask = clock->mask;
	tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

	/* Do the ns -> cycle conversion first, using original mult */
	tmp = NTP_INTERVAL_LENGTH;
	tmp <<= clock->shift;
	ntpinterval = tmp;
	tmp += clock->mult/2;
	do_div(tmp, clock->mult);
	if (tmp == 0)
		tmp = 1;

	interval = (u64) tmp;
	tk->cycle_interval = interval;

	/* Go back from cycles -> shifted ns */
	tk->xtime_interval = interval * clock->mult;
	tk->xtime_remainder = ntpinterval - tk->xtime_interval;
	tk->raw_interval = interval * clock->mult;

	/* if changing clocks, convert xtime_nsec shift units */
	if (old_clock) {
		int shift_change = clock->shift - old_clock->shift;

		if (shift_change < 0) {
			tk->tkr_mono.xtime_nsec >>= -shift_change;
			tk->tkr_raw.xtime_nsec >>= -shift_change;
		} else {
			tk->tkr_mono.xtime_nsec <<= shift_change;
			tk->tkr_raw.xtime_nsec <<= shift_change;
		}
	}

	tk->tkr_mono.shift = clock->shift;
	tk->tkr_raw.shift = clock->shift;

	tk->ntp_error = 0;
	tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
	tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

	/*
	 * The timekeeper keeps its own mult values for the currently
	 * active clocksource. These values will be adjusted via NTP
	 * to counteract clock drifting.
	 */
	tk->tkr_mono.mult = clock->mult;
	tk->tkr_raw.mult = clock->mult;
	tk->ntp_err_mult = 0;
	tk->skip_second_overflow = 0;
}

/* Timekeeper helper functions. */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
	return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}

static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
	/* Calculate the delta since the last update_wall_time() */
	u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;

	/*
	 * This detects both negative motion and the case where the delta
	 * overflows the multiplication with tkr->mult.
	 */
	if (unlikely(delta > tkr->clock->max_cycles)) {
		/*
		 * Handle clocksource inconsistency between CPUs to prevent
		 * time from going backwards by checking for the MSB of the
		 * mask being set in the delta.
		 */
		if (delta & ~(mask >> 1))
			return tkr->xtime_nsec >> tkr->shift;

		return delta_to_ns_safe(tkr, delta);
	}

	return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;
}

static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
	return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr));
}

/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @write_seqcount_latch.
 *
 * So if an NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
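 *
 * The reader side pairs with this update as in __ktime_get_fast_ns() below;
 * a sketch of that read loop, for reference only:
 *
 *	do {
 *		seq = read_seqcount_latch(&tkf->seq);
 *		tkr = tkf->base + (seq & 0x01);
 *		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
 *	} while (read_seqcount_latch_retry(&tkf->seq, seq));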
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
				   struct tk_fast *tkf)
{
	struct tk_read_base *base = tkf->base;

	/* Force readers off to base[1] */
	write_seqcount_latch_begin(&tkf->seq);

	/* Update base[0] */
	memcpy(base, tkr, sizeof(*base));

	/* Force readers back to base[0] */
	write_seqcount_latch(&tkf->seq);

	/* Update base[1] */
	memcpy(base + 1, base, sizeof(*base));

	write_seqcount_latch_end(&tkf->seq);
}

static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
	struct tk_read_base *tkr;
	unsigned int seq;
	u64 now;

	do {
		seq = read_seqcount_latch(&tkf->seq);
		tkr = tkf->base + (seq & 0x01);
		now = ktime_to_ns(tkr->base);
		now += timekeeping_get_ns(tkr);
	} while (read_seqcount_latch_retry(&tkf->seq, seq));

	return now;
}

/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *	now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
	return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);

/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
	return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);

/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It is possible that a timestamp is taken after the boot offset is
 * updated but before the timekeeper is updated. If this happens, the new boot
 * offset is added to the old timekeeping, making the clock appear to update
 * slightly earlier:
 *
 *	CPU 0					CPU 1
 *	timekeeping_inject_sleeptime64()
 *	__timekeeping_inject_sleeptime(tk, delta);
 *						timestamp();
 *	timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated. Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot)));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically, which may yield wrong
 * readouts.
 * However, an update of the TAI offset is a rare event, e.g. caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
	struct timekeeper *tk = &tk_core.timekeeper;

	return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai)));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);

/**
 * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 */
u64 ktime_get_real_fast_ns(void)
{
	struct tk_fast *tkf = &tk_fast_mono;
	struct tk_read_base *tkr;
	u64 baser, delta;
	unsigned int seq;

	do {
		seq = raw_read_seqcount_latch(&tkf->seq);
		tkr = tkf->base + (seq & 0x01);
		baser = ktime_to_ns(tkr->base_real);
		delta = timekeeping_get_ns(tkr);
	} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));

	return baser + delta;
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);

/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended. It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
	static struct tk_read_base tkr_dummy;
	const struct tk_read_base *tkr = &tk->tkr_mono;

	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
	cycles_at_suspend = tk_clock_read(tkr);
	tkr_dummy.clock = &dummy_clock;
	tkr_dummy.base_real = tkr->base + tk->offs_real;
	update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

	tkr = &tk->tkr_raw;
	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
	tkr_dummy.clock = &dummy_clock;
	update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}

static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
	raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}

/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
	struct timekeeper *tk = &tk_core.timekeeper;
	int ret;

	guard(raw_spinlock_irqsave)(&tk_core.lock);
	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
	update_pvclock_gtod(tk, true);

	return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);

/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
	tk->next_leap_ktime = ntp_get_next_leap();
	if (tk->next_leap_ktime != KTIME_MAX)
		/* Convert to monotonic time */
		tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}

/*
 * Leap state update for both shadow and the real timekeeper
 * Separate to spare a full memcpy() of the timekeeper.
*/ static void tk_update_leap_state_all(struct tk_data *tkd) { write_seqcount_begin(&tkd->seq); tk_update_leap_state(&tkd->shadow_timekeeper); tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; write_seqcount_end(&tkd->seq); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. */ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* * Restore the shadow timekeeper from the real timekeeper. */ static void timekeeping_restore_shadow(struct tk_data *tkd) { lockdep_assert_held(&tkd->lock); memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { struct timekeeper *tk = &tk_core.shadow_timekeeper; lockdep_assert_held(&tkd->lock); /* * Block out readers before running the updates below because that * updates VDSO and other time related infrastructure. Not blocking * the readers might let a reader see time going backwards when * reading from the VDSO after the VDSO update and then reading in * the kernel from the timekeeper before that got updated. */ write_seqcount_begin(&tkd->seq); if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * Update the real timekeeper. * * We could avoid this memcpy() by switching pointers, but that has * the downside that the reader side does not longer benefit from * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } /** * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; while (delta > 0) { u64 max = tk->tkr_mono.clock->max_cycles; u64 incr = delta < max ? 
delta : max; tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; tk_normalize_xtime(tk); delta -= incr; } } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended). */ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; if (IS_ENABLED(CONFIG_64BIT)) { /* * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and * tk_update_sleep_time(). 
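 *
 * On 64-bit kernels the ktime_t offsets are updated with a single
 * WRITE_ONCE() store, so one READ_ONCE() load is sufficient on this path;
 * only the 32-bit case below needs the seqcount retry loop.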
*/ return ktime_add(tmono, READ_ONCE(*offset)); } do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non * serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wrap arounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - The same as ktime_get_real_seconds * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. 
*/ noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base_raw; ktime_t base_real; ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_boot = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? 
total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * timestamp_in_interval - true if ts is chronologically in [start, end] * * True if ts occurs chronologically at or after start, and before or at end. */ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { if (ts >= start && ts <= end) return true; if (start > end && (ts >= start || ts <= end)) return true; return false; } static bool convert_clock(u64 *val, u32 numerator, u32 denominator) { u64 rem, res; if (!numerator || !denominator) return false; res = div64_u64_rem(*val, denominator, &rem) * numerator; *val = res + div_u64(rem * numerator, denominator); return true; } static bool convert_base_to_cs(struct system_counterval_t *scv) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; u32 num, den; /* The timestamp was taken from the time keeper clock source */ if (cs->id == scv->cs_id) return true; /* * Check whether cs_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != scv->cs_id) return false; num = scv->use_nsecs ? cs->freq_khz : base->numerator; den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; if (!convert_clock(&scv->cycles, num, den)) return false; scv->cycles += base->offset; return true; } static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; /* * Check whether base_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. 
*/ base = READ_ONCE(cs->base); if (!base || base->id != base_id) return false; *cycles -= base->offset; if (!convert_clock(cycles, base->denominator, base->numerator)) return false; return true; } static bool convert_ns_to_cs(u64 *delta) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) return false; *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); return true; } /** * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp * @treal: CLOCK_REALTIME timestamp to convert * @base_id: base clocksource id * @cycles: pointer to store the converted base clock timestamp * * Converts a supplied, future realtime clock value to the corresponding base clock value. * * Return: true if the conversion is successful, false otherwise. */ bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 delta; do { seq = read_seqcount_begin(&tk_core.seq); if ((u64)treal < tk->tkr_mono.base_real) return false; delta = (u64)treal - tk->tkr_mono.base_real; if (!convert_ns_to_cs(&delta)) return false; *cycles = tk->tkr_mono.cycle_last + delta; if (!convert_cs_to_base(cycles, base_id)) return false; } while (read_seqcount_retry(&tk_core.seq, seq)); return true; } EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned int seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. 
*/ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !timestamp_in_interval(history_begin->cycles, cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * timekeeping_clocksource_has_base - Check whether the current clocksource * is based on given a base clock * @id: base clocksource ID * * Note: The return value is a snapshot which can become invalid right * after the function returns. * * Return: true if the timekeeper clocksource has a base clock with @id, * false otherwise */ bool timekeeping_clocksource_has_base(enum clocksource_ids id) { /* * This is a snapshot, so no point in using the sequence * count. Just prevent the compiler from re-evaluating @base as the * clocksource might change concurrently. */ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); return base ? base->id == id : false; } EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timespec64 ts_delta, xt; if (!timespec64_valid_settod(ts)) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); xt = tk_xtime(tks); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); tk_set_xtime(tks, ts); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); audit_tk_injoffset(ts_delta); add_device_randomness(ts, sizeof(*ts)); return 0; } EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
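 *
 * A sketch of a typical use, mirroring timekeeping_warp_clock() below:
 *
 *	struct timespec64 adjust = { .tv_sec = sys_tz.tz_minuteswest * 60,
 *				     .tv_nsec = 0 };
 *
 *	timekeeping_inject_offset(&adjust);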
*/ static int timekeeping_inject_offset(const struct timespec64 *ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 tmp; timekeeping_forward_now(tks); /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); return 0; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct clocksource *new = data, *old = NULL; /* * If the clocksource is in a module, get a module reference. * Succeeds for built-in code (owner == NULL) as well. Abort if the * reference can't be acquired. */ if (!try_module_get(new->owner)) return 0; /* Abort if the device can't be enabled */ if (new->enable && new->enable(new) != 0) { module_put(new->owner); return 0; } scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); old = tks->tkr_mono.clock; tk_setup_internals(tks, new); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } if (old) { if (old->disable) old->disable(old); module_put(old->owner); } return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 
0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock64 - Return time from the persistent clock. * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * @wall_time: current time as returned by persistent clock * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } static __init void tkd_basic_setup(struct tk_data *tkd) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts of false and is only set when a suspend reaches * timekeeping_suspend(), timekeeping_resume() sets it to false when the * timekeeper clocksource is not stopping across suspend and has been * used to update sleep time. If the timekeeper clocksource has stopped * then the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected and if so the flag gets false then. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection. 
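 *
 * In short, the transitions implemented below are (a summary, not new
 * behaviour):
 *
 *	timekeeping_suspend()				-> flag = true
 *	timekeeping_resume() injects sleeptime		-> flag = false
 *	timekeeping_inject_sleeptime64() (RTC path)	-> flag = false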
*/ static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want set wall_to_mono, so the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tks, clock); tk_set_xtime(tks, &wall_time); tks->raw_sec = 0; tk_set_wall_to_mono(tks, wall_to_mono); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @tk: Pointer to the timekeeper to be updated * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by RTC subsystem. * If system has neither 1) nor 2), 3) will be used finally. * * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes needless, so in this case we don't need to call * rtc_resume(), and this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /* * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). * * But if system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(), and this is what * timekeeping_rtc_skipsuspend() means. */ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. 
* and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; suspend_timing_needed = false; timekeeping_forward_now(tks); __timekeeping_inject_sleeptime(tks, delta); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; bool inject_sleeptime = false; u64 cycle_now, nsec; unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ tks->tkr_mono.cycle_last = cycle_now; tks->tkr_raw.cycle_last = cycle_now; tks->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); /* Resume the clockevent device(s) and hrtimers */ tick_resume(); /* Notify timerfd as resume is equivalent to clock_was_set() */ timerfd_resume(); } int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); timekeeping_forward_now(tks); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. 
*/ curr_clock = tks->tkr_mono.clock; cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update_from_shadow(&tk_core, 0); halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; static int __init timekeeping_init_ops(void) { register_syscore_ops(&timekeeping_syscore_ops); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, lets assume mult_adj == 1 for now. * * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one, this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * Its the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment. 
* now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u64 ntp_tl = ntp_tick_length(); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e. 
xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into * a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. 
*/ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; } /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); /** * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor * @ts: timespec64 to be filled * * Fetch the global mg_floor value, convert it to realtime and compare it * to the current coarse-grained time. Fill @ts with whichever is * latest. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 floor = atomic64_read(&mg_floor); ktime_t f_real, offset, coarse; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); coarse = timespec64_to_ktime(*ts); f_real = ktime_add(floor, offset); if (ktime_after(f_real, coarse)) *ts = ktime_to_timespec64(f_real); } /** * ktime_get_real_ts64_mg - attempt to update floor value and return result * @ts: pointer to the timespec to be set * * Get a monotonic fine-grained time value and attempt to swap it into * mg_floor. If that succeeds then accept the new floor value. If it fails * then another task raced in during the interim time and updated the * floor. Since any update to the floor must be later than the previous * floor, either outcome is acceptable. * * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), * and determining that the resulting coarse-grained timestamp did not effect * a change in ctime. Any more recent floor value would effect a change to * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. * * @ts will be filled with the latest floor value, regardless of the outcome of * the cmpxchg. Note that this is a filesystem specific interface and should be * avoided outside of that context. 
*/ void ktime_get_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t old = atomic64_read(&mg_floor); ktime_t offset, mono; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; mono = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); mono = ktime_add_ns(mono, nsecs); /* * Attempt to update the floor with the new time value. As any * update must be later then the existing floor, and would effect * a change to ctime from the perspective of the current task, * accept the resulting floor value regardless of the outcome of * the swap. */ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. * "old" has been updated with the latest value of "mg_floor". * That value is newer than the previous floor value, which * is enough to effect a change to ctime. Accept it. */ *ts = ktime_to_timespec64(ktime_add(old, offset)); } } void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different. * * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! 
*/ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; /* * Validate if a timespec/timeval used to inject a time * offset is valid. Offsets can be positive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and * we can't have more nanoseconds/microseconds than a second. */ if (txc->time.tv_usec < 0) return -EINVAL; if (txc->modes & ADJ_NANO) { if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; } else { if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } return 0; } /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy(). */ unsigned long random_get_entropy_fallback(void) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; struct clocksource *clock = READ_ONCE(tkr->clock); if (unlikely(timekeeping_suspended || !clock)) return 0; return clock->read(clock); } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function * @txc: Pointer to kernel_timex structure containing NTP parameters */ int do_adjtimex(struct __kernel_timex *txc) { struct audit_ntp_data ad; bool offset_set = false; bool clock_set = false; struct timespec64 ts; int ret; /* Validate the data before disabling interrupts */ ret = timekeeping_validate_timex(txc); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; ret = timekeeping_inject_offset(&delta); if (ret) return ret; offset_set = delta.tv_sec != 0; audit_tk_injoffset(delta); } audit_ntp_init(&ad); ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; s32 orig_tai, tai; orig_tai = tai = tks->tai_offset; ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); clock_set = true; } else { tk_update_leap_state_all(&tk_core); } } audit_ntp_log(&ad); /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(offset_set); return ret; } #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function * @phase_ts: Pointer to timespec64 structure representing phase timestamp * @raw_ts: Pointer to timespec64 structure representing raw timestamp */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { guard(raw_spinlock_irqsave)(&tk_core.lock); __hardpps(phase_ts, raw_ts); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */
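/*
 * Editorial example — a minimal userspace sketch, not kernel code and not part
 * of the timekeeping code above. It only illustrates the O(log n) chunking idea
 * behind logarithmic_accumulation()/timekeeping_advance(): a large cycle offset
 * is consumed in power-of-two multiples of cycle_interval, halving the chunk
 * whenever the remaining offset no longer covers it. The cycle_interval and
 * offset values below are made up, and the real kernel additionally clamps the
 * shift against ntp_tick_length().
 */
#include <stdio.h>
#include <stdint.h>

static int ilog2_u64(uint64_t v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	const uint64_t cycle_interval = 1000;	/* hypothetical cycles per tick */
	uint64_t offset = 1234567;		/* hypothetical cycles since last update */
	uint64_t accumulated = 0;
	int shift, iterations = 0;

	shift = ilog2_u64(offset) - ilog2_u64(cycle_interval);
	if (shift < 0)
		shift = 0;

	while (offset >= cycle_interval) {
		uint64_t chunk = cycle_interval << shift;

		if (offset >= chunk) {
			offset -= chunk;
			accumulated += chunk;
			iterations++;
		}
		if (offset < (cycle_interval << shift))
			shift--;
	}

	printf("accumulated %llu cycles in %d iterations, %llu cycles left\n",
	       (unsigned long long)accumulated, iterations,
	       (unsigned long long)offset);
	return 0;
}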
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H #include <linux/rculist.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> #include <linux/pid.h> #include
<linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ struct core_state *core_state; /* coredumping support */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; struct hlist_head posix_timers; struct hlist_head ignored_posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. 
*/ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. 
*/ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(&current->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) { current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); } spin_unlock_irq(&current->sighand->siglock); schedule(); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr); int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t); int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); } /* * Returns 'true' if kick_process() is needed to force a transition from * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. 
*/ static inline bool __set_notify_signal(struct task_struct *task) { return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE); } /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. 
*/ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(&current->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = &current->blocked; if (unlikely(test_restore_sigmask())) res = &current->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! 
sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define for_other_threads(p, t) \ for (t = p; (t = next_thread(t)) != p; ) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. 
*/ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } /* * returns NULL if p is the last thread in the thread group */ static inline struct task_struct *__next_thread(struct task_struct *p) { return list_next_or_null_rcu(&p->signal->thread_head, &p->thread_node, struct task_struct, thread_node); } static inline struct task_struct *next_thread(struct task_struct *p) { return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) { return thread_group_leader(p) && list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; ret = __lock_task_sighand(task, flags); (void)__cond_lock(&task->sighand->siglock, ret); return ret; } static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } #ifdef CONFIG_LOCKDEP extern void lockdep_assert_task_sighand_held(struct task_struct *task); #else static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } #endif static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */
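/*
 * Editorial example — a hedged userspace sketch, not part of the header above.
 * The kernel-side sigsp()/on_sig_stack() helpers decide whether a signal frame
 * is built on the alternate stack; this standalone program shows the
 * user-visible counterpart using the standard sigaltstack() and SA_ONSTACK
 * APIs. printf() in a handler is not async-signal-safe in general, but is
 * acceptable for this synchronous raise() demo.
 */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static stack_t altstack;

static void handler(int sig, siginfo_t *info, void *ucontext)
{
	char probe;	/* lives on whichever stack the frame was built on */
	uintptr_t sp = (uintptr_t)&probe;
	uintptr_t lo = (uintptr_t)altstack.ss_sp;
	uintptr_t hi = lo + altstack.ss_size;

	(void)info;
	(void)ucontext;
	printf("signal %d handled %s the alternate stack\n",
	       sig, (sp >= lo && sp < hi) ? "on" : "off");
}

int main(void)
{
	struct sigaction sa = { 0 };

	altstack.ss_sp = malloc(SIGSTKSZ);
	altstack.ss_size = SIGSTKSZ;
	altstack.ss_flags = 0;
	if (!altstack.ss_sp || sigaltstack(&altstack, NULL) != 0) {
		perror("sigaltstack");
		return 1;
	}

	sa.sa_sigaction = handler;
	sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);
	free(altstack.ss_sp);
	return 0;
}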
/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/clnt.h * * Declarations for the high-level RPC client interface * * Copyright (C) 1995, 1996, Olaf Kirch <okir@monad.swb.de> */ #ifndef _LINUX_SUNRPC_CLNT_H #define _LINUX_SUNRPC_CLNT_H #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/refcount.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/timer.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <asm/signal.h> #include <linux/path.h> #include <net/ipv6.h> #include <linux/sunrpc/xprtmultipath.h> struct rpc_inode; struct rpc_sysfs_client { struct kobject kobject; struct net *net; struct rpc_clnt *clnt; struct rpc_xprt_switch *xprt_switch; }; /* * The high-level client handle */ struct rpc_clnt { refcount_t cl_count; /* Number of references */ unsigned int cl_clid; /* client id */ struct list_head cl_clients; /* Global list of clients */ struct list_head cl_tasks; /* List of tasks */ atomic_t cl_pid; /* task PID counter */ spinlock_t cl_lock; /* spinlock */ struct rpc_xprt __rcu * cl_xprt; /* transport */ const struct rpc_procinfo *cl_procinfo; /* procedure info */ u32 cl_prog, /* RPC program number */ cl_vers, /* RPC version number */ cl_maxproc; /* max procedure number */ struct rpc_auth * cl_auth; /* authenticator */ struct rpc_stat * cl_stats; /* per-program statistics */ struct rpc_iostats * cl_metrics; /* per-client statistics */ unsigned int cl_softrtry : 1,/* soft timeouts */ cl_softerr : 1,/* Timeouts return errors */ cl_discrtry : 1,/* disconnect before retry */ cl_noretranstimeo: 1,/* No retransmit timeouts */ cl_autobind : 1,/* use getport() */ cl_chatty : 1,/* be verbose */ cl_shutdown : 1;/* rpc immediate -EIO */ struct xprtsec_parms cl_xprtsec; /* transport security policy */ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ atomic_t cl_swapper; /* swapfile count */ int cl_nodelen; /* nodename length */ char cl_nodename[UNX_MAXNODENAME+1]; struct rpc_pipe_dir_head cl_pipedir_objects; struct rpc_clnt * cl_parent; /* Points to parent of clones */ struct rpc_rtt cl_rtt_default; struct rpc_timeout cl_timeout_default; const struct rpc_program *cl_program; const char * cl_principal; /* use for machine cred */ #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *cl_debugfs; /* debugfs directory */
#endif struct rpc_sysfs_client *cl_sysfs; /* sysfs directory */ /* cl_work is only needed after cl_xpi is no longer used, * and that are of similar size */ union { struct rpc_xprt_iter cl_xpi; struct work_struct cl_work; }; const struct cred *cl_cred; unsigned int cl_max_connect; /* max number of transports not to the same IP */ struct super_block *pipefs_sb; atomic_t cl_task_count; }; /* * General RPC program info */ #define RPC_MAXVERSION 4 struct rpc_program { const char * name; /* protocol name */ u32 number; /* program number */ unsigned int nrvers; /* number of versions */ const struct rpc_version ** version; /* version array */ struct rpc_stat * stats; /* statistics */ const char * pipe_dir_name; /* path to rpc_pipefs dir */ }; struct rpc_version { u32 number; /* version number */ unsigned int nrprocs; /* number of procs */ const struct rpc_procinfo *procs; /* procedure array */ unsigned int *counts; /* call counts */ }; /* * Procedure information */ struct rpc_procinfo { u32 p_proc; /* RPC procedure number */ kxdreproc_t p_encode; /* XDR encode function */ kxdrdproc_t p_decode; /* XDR decode function */ unsigned int p_arglen; /* argument hdr length (u32) */ unsigned int p_replen; /* reply hdr length (u32) */ unsigned int p_timer; /* Which RTT timer to use */ u32 p_statidx; /* Which procedure to account */ const char * p_name; /* name of procedure */ }; struct rpc_create_args { struct net *net; int protocol; struct sockaddr *address; size_t addrsize; struct sockaddr *saddress; const struct rpc_timeout *timeout; const char *servername; const char *nodename; const struct rpc_program *program; struct rpc_stat *stats; u32 prognumber; /* overrides program->number */ u32 version; rpc_authflavor_t authflavor; u32 nconnect; unsigned long flags; char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ const struct cred *cred; unsigned int max_connect; struct xprtsec_parms xprtsec; unsigned long connect_timeout; unsigned long reconnect_timeout; }; struct rpc_add_xprt_test { void (*add_xprt_test)(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *calldata); void *data; }; /* Values for "flags" field */ #define RPC_CLNT_CREATE_HARDRTRY (1UL << 0) #define RPC_CLNT_CREATE_AUTOBIND (1UL << 2) #define RPC_CLNT_CREATE_NONPRIVPORT (1UL << 3) #define RPC_CLNT_CREATE_NOPING (1UL << 4) #define RPC_CLNT_CREATE_DISCRTRY (1UL << 5) #define RPC_CLNT_CREATE_QUIET (1UL << 6) #define RPC_CLNT_CREATE_INFINITE_SLOTS (1UL << 7) #define RPC_CLNT_CREATE_NO_IDLE_TIMEOUT (1UL << 8) #define RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT (1UL << 9) #define RPC_CLNT_CREATE_SOFTERR (1UL << 10) #define RPC_CLNT_CREATE_REUSEPORT (1UL << 11) #define RPC_CLNT_CREATE_CONNECTED (1UL << 12) struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, const struct rpc_program *, u32); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *, rpc_authflavor_t); int rpc_switch_client_transport(struct rpc_clnt *, struct xprt_create *, const struct rpc_timeout *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_transport(struct rpc_task *); void rpc_task_release_client(struct rpc_task *); struct rpc_xprt *rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt); int rpcb_create_local(struct net *); void rpcb_put_local(struct net *); int rpcb_register(struct net *, u32, u32, int, unsigned short); int rpcb_v4_register(struct net *net, const u32 
program, const u32 version, const struct sockaddr *address, const char *netid); void rpcb_getport_async(struct rpc_task *); void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize); void rpc_call_start(struct rpc_task *); int rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, const struct rpc_call_ops *tk_ops, void *calldata); int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags); struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags); int rpc_restart_call_prepare(struct rpc_task *); int rpc_restart_call(struct rpc_task *); void rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int); struct net * rpc_net_ns(struct rpc_clnt *); size_t rpc_max_payload(struct rpc_clnt *); size_t rpc_max_bc_payload(struct rpc_clnt *); unsigned int rpc_num_bc_slots(struct rpc_clnt *); void rpc_force_rebind(struct rpc_clnt *); size_t rpc_peeraddr(struct rpc_clnt *, struct sockaddr *, size_t); const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); int rpc_localaddr(struct rpc_clnt *, struct sockaddr *, size_t); int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *), void *data); int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, void *dummy); int rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *, int (*setup)(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *), void *data); void rpc_set_connect_timeout(struct rpc_clnt *clnt, unsigned long connect_timeout, unsigned long reconnect_timeout); int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *); void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *); void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *, struct rpc_add_xprt_test *); const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); void rpc_clnt_disconnect(struct rpc_clnt *clnt); void rpc_cleanup_clids(void); static inline int rpc_reply_expected(struct rpc_task *task) { return (task->tk_msg.rpc_proc != NULL) && (task->tk_msg.rpc_proc->p_decode != NULL); } static inline void rpc_task_close_connection(struct rpc_task *task) { if (task->tk_xprt) xprt_force_disconnect(task->tk_xprt); } #endif /* _LINUX_SUNRPC_CLNT_H */
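/*
 * Editorial example — a hedged sketch, not a complete module, of how a
 * kernel-side consumer of this header typically builds a client: fill
 * struct rpc_create_args with the fields declared above, call rpc_create(),
 * and later drop the reference with rpc_shutdown_client(). "my_prog", the
 * server name and the version number are hypothetical placeholders, and
 * error handling is reduced to the IS_ERR() check.
 */
static struct rpc_clnt *example_create_client(struct net *net,
					      struct sockaddr *addr,
					      size_t addrlen,
					      const struct rpc_program *my_prog)
{
	struct rpc_create_args args = {
		.net		= net,
		.protocol	= IPPROTO_TCP,
		.address	= addr,
		.addrsize	= addrlen,
		.servername	= "example-server",	/* hypothetical */
		.program	= my_prog,		/* hypothetical program */
		.version	= 1,			/* hypothetical version */
		.authflavor	= RPC_AUTH_UNIX,
		.flags		= RPC_CLNT_CREATE_NOPING,
	};
	struct rpc_clnt *clnt = rpc_create(&args);

	if (IS_ERR(clnt))
		return clnt;	/* propagate the ERR_PTR() to the caller */

	/* ... issue calls via rpc_call_sync()/rpc_call_async() ... */
	return clnt;		/* paired with rpc_shutdown_client(clnt) */
}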
// SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for AVX assembler versions of Serpent Cipher * * Copyright (C) 2012 Johannes Goetzfried * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> * * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ #include <linux/module.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" #include "ecb_cbc_helpers.h" /* 8-way parallel cipher functions */ asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx); asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx); asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); ECB_BLOCK(1, __serpent_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); ECB_BLOCK(1, __serpent_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__serpent_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); CBC_DEC_BLOCK(1, __serpent_decrypt); CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", .base.cra_driver_name = "__ecb-serpent-avx", .base.cra_priority = 500, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(serpent)", .base.cra_driver_name = "__cbc-serpent-avx", .base.cra_priority = 500, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, }; static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init serpent_init(void) { const char *feature_name; if (!cpu_has_xfeatures(XFEATURE_MASK_SSE |
XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } static void __exit serpent_exit(void) { simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } module_init(serpent_init); module_exit(serpent_exit); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("serpent");
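/*
 * Editorial example — a standalone toy sketch, not kernel code, of why the
 * glue code above decrypts CBC eight blocks at a time but encrypts it one
 * block at a time: C[i] = E(P[i] ^ C[i-1]) chains on the ciphertext just
 * produced, while P[i] = D(C[i]) ^ C[i-1] depends only on ciphertext that is
 * already available. A one-byte XOR "cipher" stands in for Serpent and the
 * 8-block width mirrors SERPENT_PARALLEL_BLOCKS; nothing here reflects real
 * Serpent internals.
 */
#include <stdio.h>
#include <string.h>

#define NBLOCKS 8	/* mirrors the 8-way AVX width, one byte per "block" */

static unsigned char E(unsigned char b, unsigned char key) { return b ^ key; }
static unsigned char D(unsigned char b, unsigned char key) { return b ^ key; }

int main(void)
{
	const unsigned char key = 0x5a, iv = 0x3c;
	unsigned char pt[NBLOCKS] = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' };
	unsigned char ct[NBLOCKS], out[NBLOCKS];
	unsigned char prev = iv;
	int i;

	/* CBC encryption is inherently serial: block i needs ciphertext i-1 */
	for (i = 0; i < NBLOCKS; i++) {
		ct[i] = E(pt[i] ^ prev, key);
		prev = ct[i];
	}

	/* CBC decryption can process every block independently (in parallel) */
	for (i = 0; i < NBLOCKS; i++)
		out[i] = D(ct[i], key) ^ (i ? ct[i - 1] : iv);

	printf("round-trip %s\n", memcmp(pt, out, NBLOCKS) ? "failed" : "ok");
	return 0;
}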
/* * linux/fs/nls/nls_cp1250.c * * Charset cp1250 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x20ac, 0x0000, 0x201a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021, 0x0000, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* 0x90*/ 0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x0000, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* 0xa0*/ 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* 0xb0*/ 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* 0xc0*/ 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* 0xd0*/ 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4,
0x0150, 0x00d6, 0x00d7, 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* 0xe0*/ 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* 0xf0*/ 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ 0xb0, 0xb1, 0x00, 0x00, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0xc1, 0xc2, 0x00, 0xc4, 0x00, 0x00, 0xc7, /* 0xc0-0xc7 */ 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xd3, 0xd4, 0x00, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0x00, 0x00, 0xda, 0x00, 0xdc, 0xdd, 0x00, 0xdf, /* 0xd8-0xdf */ 0x00, 0xe1, 0xe2, 0x00, 0xe4, 0x00, 0x00, 0xe7, /* 0xe0-0xe7 */ 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xf3, 0xf4, 0x00, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0x00, 0x00, 0xfa, 0x00, 0xfc, 0xfd, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0xc3, 0xe3, 0xa5, 0xb9, 0xc6, 0xe6, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0xcf, 0xef, /* 0x08-0x0f */ 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xca, 0xea, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0xc5, 0xe5, 0x00, 0x00, 0xbc, 0xbe, 0x00, /* 0x38-0x3f */ 0x00, 0xa3, 0xb3, 0xd1, 0xf1, 0x00, 0x00, 0xd2, /* 0x40-0x47 */ 0xf2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xd5, 0xf5, 0x00, 0x00, 0xc0, 0xe0, 0x00, 0x00, /* 0x50-0x57 */ 0xd8, 0xf8, 0x8c, 0x9c, 0x00, 0x00, 0xaa, 0xba, /* 0x58-0x5f */ 0x8a, 0x9a, 0xde, 0xfe, 0x8d, 0x9d, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd9, 0xf9, /* 0x68-0x6f */ 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x8f, 0x9f, 0xaf, 0xbf, 0x8e, 0x9e, 0x00, /* 0x78-0x7f */ }; static const unsigned 
char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa1, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0xa2, 0xff, 0x00, 0xb2, 0x00, 0xbd, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x91, 0x92, 0x82, 0x00, 0x93, 0x94, 0x84, 0x00, /* 0x18-0x1f */ 0x86, 0x87, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x8b, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
/* 0x18-0x1f */ 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x00, 0x89, 0x9a, 0x8b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x88-0x8f */ 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x00, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xb3, 0xa4, 0xb9, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xba, 0xab, 0xac, 0xad, 0xae, 0xbf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbe, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 
/* 0x78-0x7f */ 0x80, 0x00, 0x82, 0x00, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x00, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x00, 0x99, 0x8a, 0x9b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xa3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xa5, 0xaa, 0xbb, 0xbc, 0xbd, 0xbc, 0xaf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0x00, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp1250", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp1250(void) { return register_nls(&table); } static void __exit exit_nls_cp1250(void) { unregister_nls(&table); } module_init(init_nls_cp1250) module_exit(exit_nls_cp1250) MODULE_DESCRIPTION("NLS Windows CP1250 (Slavic/Central European Languages)"); MODULE_LICENSE("Dual BSD/GPL");
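The uni2char()/page_uni2charset lookup above is a two-level table: the high byte of the Unicode code point selects one of the page arrays (page00, page01, ...), and the low byte indexes into that page to get the CP1250 byte, with 0x00 meaning "no exact mapping". The following standalone sketch illustrates the same scheme outside the kernel; everything prefixed demo_ is hypothetical and the tables hold only a tiny subset of the real cp1250 data.

#include <stdio.h>

typedef unsigned short wchar16;

/* Hypothetical page for U+01xx: only U+0104 -> 0xa5 and U+0105 -> 0xb9 here. */
static const unsigned char demo_page01[256] = {
	[0x04] = 0xa5,
	[0x05] = 0xb9,
};

/* Page table indexed by the high byte of the code point. */
static const unsigned char *const demo_pages[256] = {
	[0x01] = demo_page01,
};

/* Mirrors uni2char(): high byte selects the page, low byte indexes into it. */
static int demo_uni2char(wchar16 uni, unsigned char *out)
{
	const unsigned char *page = demo_pages[(uni >> 8) & 0xff];
	unsigned char cl = uni & 0xff;

	if (page && page[cl]) {
		*out = page[cl];
		return 1;
	}
	return -1;	/* no exact mapping in this demo table */
}

int main(void)
{
	unsigned char c;

	if (demo_uni2char(0x0104, &c) == 1)
		printf("U+0104 -> 0x%02x\n", c);	/* prints 0xa5 */
	if (demo_uni2char(0x20ac, &c) != 1)
		printf("U+20AC not mapped in this demo table\n");
	return 0;
}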
/* BNEP implementation for Linux Bluetooth stack (BlueZ).
Copyright (C) 2001-2002 Inventel Systemes Written 2001-2002 by Clément Moreau <clement.moreau@inventel.fr> David Libault <david.libault@inventel.fr> Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/module.h> #include <linux/kthread.h> #include <linux/file.h> #include <linux/etherdevice.h> #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/hci_core.h> #include "bnep.h" #define VERSION "1.3" static bool compress_src = true; static bool compress_dst = true; static LIST_HEAD(bnep_session_list); static DECLARE_RWSEM(bnep_session_sem); static struct bnep_session *__bnep_get_session(u8 *dst) { struct bnep_session *s; BT_DBG(""); list_for_each_entry(s, &bnep_session_list, list) if (ether_addr_equal(dst, s->eh.h_source)) return s; return NULL; } static void __bnep_link_session(struct bnep_session *s) { list_add(&s->list, &bnep_session_list); } static void __bnep_unlink_session(struct bnep_session *s) { list_del(&s->list); } static int bnep_send(struct bnep_session *s, void *data, size_t len) { struct socket *sock = s->sock; struct kvec iv = { data, len }; return kernel_sendmsg(sock, &s->msg, &iv, 1, len); } static int bnep_send_rsp(struct bnep_session *s, u8 ctrl, u16 resp) { struct bnep_control_rsp rsp; rsp.type = BNEP_CONTROL; rsp.ctrl = ctrl; rsp.resp = htons(resp); return bnep_send(s, &rsp, sizeof(rsp)); } #ifdef CONFIG_BT_BNEP_PROTO_FILTER static inline void bnep_set_default_proto_filter(struct bnep_session *s) { /* (IPv4, ARP) */ s->proto_filter[0].start = ETH_P_IP; s->proto_filter[0].end = ETH_P_ARP; /* (RARP, AppleTalk) */ s->proto_filter[1].start = ETH_P_RARP; s->proto_filter[1].end = ETH_P_AARP; /* (IPX, IPv6) */ s->proto_filter[2].start = ETH_P_IPX; s->proto_filter[2].end = ETH_P_IPV6; } #endif static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len) { int n; if (len < 2) return -EILSEQ; n = get_unaligned_be16(data); data++; len -= 2; if (len < n) return -EILSEQ; BT_DBG("filter len %d", n); #ifdef CONFIG_BT_BNEP_PROTO_FILTER n /= 4; if (n <= BNEP_MAX_PROTO_FILTERS) { struct bnep_proto_filter *f = s->proto_filter; int i; for (i = 0; i < n; i++) { f[i].start = get_unaligned_be16(data++); f[i].end = get_unaligned_be16(data++); BT_DBG("proto filter start %u end %u", f[i].start, f[i].end); } if (i < BNEP_MAX_PROTO_FILTERS) memset(f + i, 0, sizeof(*f)); if (n == 0) bnep_set_default_proto_filter(s); bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_SUCCESS); } else { bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_LIMIT_REACHED); } #else bnep_send_rsp(s, 
BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_UNSUPPORTED_REQ); #endif return 0; } static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len) { int n; if (len < 2) return -EILSEQ; n = get_unaligned_be16(data); data += 2; len -= 2; if (len < n) return -EILSEQ; BT_DBG("filter len %d", n); #ifdef CONFIG_BT_BNEP_MC_FILTER n /= (ETH_ALEN * 2); if (n > 0) { int i; s->mc_filter = 0; /* Always send broadcast */ set_bit(bnep_mc_hash(s->dev->broadcast), (ulong *) &s->mc_filter); /* Add address ranges to the multicast hash */ for (; n > 0; n--) { u8 a1[6], *a2; memcpy(a1, data, ETH_ALEN); data += ETH_ALEN; a2 = data; data += ETH_ALEN; BT_DBG("mc filter %pMR -> %pMR", a1, a2); /* Iterate from a1 to a2 */ set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter); while (memcmp(a1, a2, 6) < 0 && s->mc_filter != ~0LL) { /* Increment a1 */ i = 5; while (i >= 0 && ++a1[i--] == 0) ; set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter); } } } BT_DBG("mc filter hash 0x%llx", s->mc_filter); bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_SUCCESS); #else bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_FILTER_UNSUPPORTED_REQ); #endif return 0; } static int bnep_rx_control(struct bnep_session *s, void *data, int len) { u8 cmd = *(u8 *)data; int err = 0; data++; len--; switch (cmd) { case BNEP_CMD_NOT_UNDERSTOOD: case BNEP_SETUP_CONN_RSP: case BNEP_FILTER_NET_TYPE_RSP: case BNEP_FILTER_MULTI_ADDR_RSP: /* Ignore these for now */ break; case BNEP_FILTER_NET_TYPE_SET: err = bnep_ctrl_set_netfilter(s, data, len); break; case BNEP_FILTER_MULTI_ADDR_SET: err = bnep_ctrl_set_mcfilter(s, data, len); break; case BNEP_SETUP_CONN_REQ: /* Successful response should be sent only once */ if (test_bit(BNEP_SETUP_RESPONSE, &s->flags) && !test_and_set_bit(BNEP_SETUP_RSP_SENT, &s->flags)) err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, BNEP_SUCCESS); else err = bnep_send_rsp(s, BNEP_SETUP_CONN_RSP, BNEP_CONN_NOT_ALLOWED); break; default: { u8 pkt[3]; pkt[0] = BNEP_CONTROL; pkt[1] = BNEP_CMD_NOT_UNDERSTOOD; pkt[2] = cmd; err = bnep_send(s, pkt, sizeof(pkt)); } break; } return err; } static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb) { struct bnep_ext_hdr *h; int err = 0; do { h = (void *) skb->data; if (!skb_pull(skb, sizeof(*h))) { err = -EILSEQ; break; } BT_DBG("type 0x%x len %u", h->type, h->len); switch (h->type & BNEP_TYPE_MASK) { case BNEP_EXT_CONTROL: bnep_rx_control(s, skb->data, skb->len); break; default: /* Unknown extension, skip it. 
*/ break; } if (!skb_pull(skb, h->len)) { err = -EILSEQ; break; } } while (!err && (h->type & BNEP_EXT_HEADER)); return err; } static u8 __bnep_rx_hlen[] = { ETH_HLEN, /* BNEP_GENERAL */ 0, /* BNEP_CONTROL */ 2, /* BNEP_COMPRESSED */ ETH_ALEN + 2, /* BNEP_COMPRESSED_SRC_ONLY */ ETH_ALEN + 2 /* BNEP_COMPRESSED_DST_ONLY */ }; static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) { struct net_device *dev = s->dev; struct sk_buff *nskb; u8 type, ctrl_type; dev->stats.rx_bytes += skb->len; type = *(u8 *) skb->data; skb_pull(skb, 1); ctrl_type = *(u8 *)skb->data; if ((type & BNEP_TYPE_MASK) >= sizeof(__bnep_rx_hlen)) goto badframe; if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) { if (bnep_rx_control(s, skb->data, skb->len) < 0) { dev->stats.tx_errors++; kfree_skb(skb); return 0; } if (!(type & BNEP_EXT_HEADER)) { kfree_skb(skb); return 0; } /* Verify and pull ctrl message since it's already processed */ switch (ctrl_type) { case BNEP_SETUP_CONN_REQ: /* Pull: ctrl type (1 b), len (1 b), data (len bytes) */ if (!skb_pull(skb, 2 + *(u8 *)(skb->data + 1) * 2)) goto badframe; break; case BNEP_FILTER_MULTI_ADDR_SET: case BNEP_FILTER_NET_TYPE_SET: /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */ if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2)) goto badframe; break; default: kfree_skb(skb); return 0; } } else { skb_reset_mac_header(skb); /* Verify and pull out header */ if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK])) goto badframe; s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2)); } if (type & BNEP_EXT_HEADER) { if (bnep_rx_extension(s, skb) < 0) goto badframe; } /* Strip 802.1p header */ if (ntohs(s->eh.h_proto) == ETH_P_8021Q) { if (!skb_pull(skb, 4)) goto badframe; s->eh.h_proto = get_unaligned((__be16 *) (skb->data - 2)); } /* We have to alloc new skb and copy data here :(. Because original skb * may not be modified and because of the alignment requirements. 
*/ nskb = alloc_skb(2 + ETH_HLEN + skb->len, GFP_KERNEL); if (!nskb) { dev->stats.rx_dropped++; kfree_skb(skb); return -ENOMEM; } skb_reserve(nskb, 2); /* Decompress header and construct ether frame */ switch (type & BNEP_TYPE_MASK) { case BNEP_COMPRESSED: __skb_put_data(nskb, &s->eh, ETH_HLEN); break; case BNEP_COMPRESSED_SRC_ONLY: __skb_put_data(nskb, s->eh.h_dest, ETH_ALEN); __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN); put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2)); break; case BNEP_COMPRESSED_DST_ONLY: __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN); __skb_put_data(nskb, s->eh.h_source, ETH_ALEN); put_unaligned(s->eh.h_proto, (__be16 *)__skb_put(nskb, 2)); break; case BNEP_GENERAL: __skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN * 2); put_unaligned(s->eh.h_proto, (__be16 *) __skb_put(nskb, 2)); break; } skb_copy_from_linear_data(skb, __skb_put(nskb, skb->len), skb->len); kfree_skb(skb); dev->stats.rx_packets++; nskb->ip_summed = CHECKSUM_NONE; nskb->protocol = eth_type_trans(nskb, dev); netif_rx(nskb); return 0; badframe: dev->stats.rx_errors++; kfree_skb(skb); return 0; } static u8 __bnep_tx_types[] = { BNEP_GENERAL, BNEP_COMPRESSED_SRC_ONLY, BNEP_COMPRESSED_DST_ONLY, BNEP_COMPRESSED }; static int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb) { struct ethhdr *eh = (void *) skb->data; struct socket *sock = s->sock; struct kvec iv[3]; int len = 0, il = 0; u8 type = 0; BT_DBG("skb %p dev %p type %u", skb, skb->dev, skb->pkt_type); if (!skb->dev) { /* Control frame sent by us */ goto send; } iv[il++] = (struct kvec) { &type, 1 }; len++; if (compress_src && ether_addr_equal(eh->h_dest, s->eh.h_source)) type |= 0x01; if (compress_dst && ether_addr_equal(eh->h_source, s->eh.h_dest)) type |= 0x02; if (type) skb_pull(skb, ETH_ALEN * 2); type = __bnep_tx_types[type]; switch (type) { case BNEP_COMPRESSED_SRC_ONLY: iv[il++] = (struct kvec) { eh->h_source, ETH_ALEN }; len += ETH_ALEN; break; case BNEP_COMPRESSED_DST_ONLY: iv[il++] = (struct kvec) { eh->h_dest, ETH_ALEN }; len += ETH_ALEN; break; } send: iv[il++] = (struct kvec) { skb->data, skb->len }; len += skb->len; /* FIXME: linearize skb */ { len = kernel_sendmsg(sock, &s->msg, iv, il, len); } kfree_skb(skb); if (len > 0) { s->dev->stats.tx_bytes += len; s->dev->stats.tx_packets++; return 0; } return len; } static int bnep_session(void *arg) { struct bnep_session *s = arg; struct net_device *dev = s->dev; struct sock *sk = s->sock->sk; struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); BT_DBG(""); set_user_nice(current, -15); add_wait_queue(sk_sleep(sk), &wait); while (1) { if (atomic_read(&s->terminate)) break; /* RX */ while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); if (!skb_linearize(skb)) bnep_rx_frame(s, skb); else kfree_skb(skb); } if (sk->sk_state != BT_CONNECTED) break; /* TX */ while ((skb = skb_dequeue(&sk->sk_write_queue))) if (bnep_tx_frame(s, skb)) break; netif_wake_queue(dev); /* * wait_woken() performs the necessary memory barriers * for us; see the header comment for this primitive. 
*/ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(sk_sleep(sk), &wait); /* Cleanup session */ down_write(&bnep_session_sem); /* Delete network device */ unregister_netdev(dev); /* Wakeup user-space polling for socket errors */ s->sock->sk->sk_err = EUNATCH; wake_up_interruptible(sk_sleep(s->sock->sk)); /* Release the socket */ fput(s->sock->file); __bnep_unlink_session(s); up_write(&bnep_session_sem); free_netdev(dev); module_put_and_kthread_exit(0); return 0; } static struct device *bnep_get_device(struct bnep_session *session) { struct l2cap_conn *conn = l2cap_pi(session->sock->sk)->chan->conn; if (!conn || !conn->hcon) return NULL; return &conn->hcon->dev; } static const struct device_type bnep_type = { .name = "bluetooth", }; int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) { u32 valid_flags = BIT(BNEP_SETUP_RESPONSE); struct net_device *dev; struct bnep_session *s, *ss; u8 dst[ETH_ALEN], src[ETH_ALEN]; int err; BT_DBG(""); if (!l2cap_is_socket(sock)) return -EBADFD; if (req->flags & ~valid_flags) return -EINVAL; baswap((void *) dst, &l2cap_pi(sock->sk)->chan->dst); baswap((void *) src, &l2cap_pi(sock->sk)->chan->src); /* session struct allocated as private part of net_device */ dev = alloc_netdev(sizeof(struct bnep_session), (*req->device) ? req->device : "bnep%d", NET_NAME_UNKNOWN, bnep_net_setup); if (!dev) return -ENOMEM; down_write(&bnep_session_sem); ss = __bnep_get_session(dst); if (ss && ss->state == BT_CONNECTED) { err = -EEXIST; goto failed; } s = netdev_priv(dev); /* This is rx header therefore addresses are swapped. * ie. eh.h_dest is our local address. */ memcpy(s->eh.h_dest, &src, ETH_ALEN); memcpy(s->eh.h_source, &dst, ETH_ALEN); eth_hw_addr_set(dev, s->eh.h_dest); s->dev = dev; s->sock = sock; s->role = req->role; s->state = BT_CONNECTED; s->flags = req->flags; s->msg.msg_flags = MSG_NOSIGNAL; #ifdef CONFIG_BT_BNEP_MC_FILTER /* Set default mc filter to not filter out any mc addresses * as defined in the BNEP specification (revision 0.95a) * http://grouper.ieee.org/groups/802/15/Bluetooth/BNEP.pdf */ s->mc_filter = ~0LL; #endif #ifdef CONFIG_BT_BNEP_PROTO_FILTER /* Set default protocol filter */ bnep_set_default_proto_filter(s); #endif SET_NETDEV_DEV(dev, bnep_get_device(s)); SET_NETDEV_DEVTYPE(dev, &bnep_type); err = register_netdev(dev); if (err) goto failed; __bnep_link_session(s); __module_get(THIS_MODULE); s->task = kthread_run(bnep_session, s, "kbnepd %s", dev->name); if (IS_ERR(s->task)) { /* Session thread start failed, gotta cleanup. 
*/ module_put(THIS_MODULE); unregister_netdev(dev); __bnep_unlink_session(s); err = PTR_ERR(s->task); goto failed; } up_write(&bnep_session_sem); strcpy(req->device, dev->name); return 0; failed: up_write(&bnep_session_sem); free_netdev(dev); return err; } int bnep_del_connection(struct bnep_conndel_req *req) { u32 valid_flags = 0; struct bnep_session *s; int err = 0; BT_DBG(""); if (req->flags & ~valid_flags) return -EINVAL; down_read(&bnep_session_sem); s = __bnep_get_session(req->dst); if (s) { atomic_inc(&s->terminate); wake_up_interruptible(sk_sleep(s->sock->sk)); } else err = -ENOENT; up_read(&bnep_session_sem); return err; } static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s) { u32 valid_flags = BIT(BNEP_SETUP_RESPONSE); memset(ci, 0, sizeof(*ci)); memcpy(ci->dst, s->eh.h_source, ETH_ALEN); strcpy(ci->device, s->dev->name); ci->flags = s->flags & valid_flags; ci->state = s->state; ci->role = s->role; } int bnep_get_connlist(struct bnep_connlist_req *req) { struct bnep_session *s; int err = 0, n = 0; down_read(&bnep_session_sem); list_for_each_entry(s, &bnep_session_list, list) { struct bnep_conninfo ci; __bnep_copy_ci(&ci, s); if (copy_to_user(req->ci, &ci, sizeof(ci))) { err = -EFAULT; break; } if (++n >= req->cnum) break; req->ci++; } req->cnum = n; up_read(&bnep_session_sem); return err; } int bnep_get_conninfo(struct bnep_conninfo *ci) { struct bnep_session *s; int err = 0; down_read(&bnep_session_sem); s = __bnep_get_session(ci->dst); if (s) __bnep_copy_ci(ci, s); else err = -ENOENT; up_read(&bnep_session_sem); return err; } static int __init bnep_init(void) { char flt[50] = ""; #ifdef CONFIG_BT_BNEP_PROTO_FILTER strcat(flt, "protocol "); #endif #ifdef CONFIG_BT_BNEP_MC_FILTER strcat(flt, "multicast"); #endif BT_INFO("BNEP (Ethernet Emulation) ver %s", VERSION); if (flt[0]) BT_INFO("BNEP filters: %s", flt); return bnep_sock_init(); } static void __exit bnep_exit(void) { bnep_sock_cleanup(); } module_init(bnep_init); module_exit(bnep_exit); module_param(compress_src, bool, 0644); MODULE_PARM_DESC(compress_src, "Compress sources headers"); module_param(compress_dst, bool, 0644); MODULE_PARM_DESC(compress_dst, "Compress destination headers"); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth BNEP ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("bt-proto-4");
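In bnep_tx_frame() above, the header-compression decision is a two-bit index into __bnep_tx_types[]: bit 0 is set when the frame's destination equals the session's remote address, bit 1 when its source equals our local address, and the selected type determines which MAC addresses still need to be carried on the wire. The sketch below reproduces only that index trick as a standalone program; the demo_ names and sample addresses are hypothetical, and the strings merely label the kernel's BNEP_* constants.

#include <stdio.h>
#include <string.h>

static const char *const demo_tx_type_names[4] = {
	"BNEP_GENERAL",			/* carry both MAC addresses             */
	"BNEP_COMPRESSED_SRC_ONLY",	/* dst implied by the session, send src */
	"BNEP_COMPRESSED_DST_ONLY",	/* src implied by the session, send dst */
	"BNEP_COMPRESSED",		/* both implied, send neither           */
};

static unsigned int demo_pick_type(const unsigned char *eth_dst,
				   const unsigned char *eth_src,
				   const unsigned char *sess_remote,
				   const unsigned char *sess_local)
{
	unsigned int idx = 0;

	if (!memcmp(eth_dst, sess_remote, 6))
		idx |= 0x01;	/* compress_src case in bnep_tx_frame() */
	if (!memcmp(eth_src, sess_local, 6))
		idx |= 0x02;	/* compress_dst case in bnep_tx_frame() */
	return idx;
}

int main(void)
{
	const unsigned char remote[6] = { 0x00, 0x1a, 0x7d, 0xda, 0x71, 0x13 };
	const unsigned char local[6]  = { 0x00, 0x1b, 0xdc, 0x0f, 0x2a, 0x44 };
	const unsigned char other[6]  = { 0x02, 0x42, 0xac, 0x11, 0x00, 0x02 };

	/* Frame from us straight to the peer: both addresses are implied. */
	puts(demo_tx_type_names[demo_pick_type(remote, local, remote, local)]);
	/* Bridged frame from another station behind us: only dst is implied. */
	puts(demo_tx_type_names[demo_pick_type(remote, other, remote, local)]);
	return 0;
}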
/* * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> #include <linux/bitops.h> #include <linux/export.h> #include "rds.h" /* * This file implements the receive side of the unconventional congestion * management in RDS. * * Messages waiting in the receive queue on the receiving socket are accounted * against the sockets SO_RCVBUF option value. Only the payload bytes in the * message are accounted for.
If the number of bytes queued equals or exceeds * rcvbuf then the socket is congested. All sends attempted to this socket's * address should return block or return -EWOULDBLOCK. * * Applications are expected to be reasonably tuned such that this situation * very rarely occurs. An application encountering this "back-pressure" is * considered a bug. * * This is implemented by having each node maintain bitmaps which indicate * which ports on bound addresses are congested. As the bitmap changes it is * sent through all the connections which terminate in the local address of the * bitmap which changed. * * The bitmaps are allocated as connections are brought up. This avoids * allocation in the interrupt handling path which queues messages on sockets. * The dense bitmaps let transports send the entire bitmap on any bitmap change * reasonably efficiently. This is much easier to implement than some * finer-grained communication of per-port congestion. The sender does a very * inexpensive bit test to test if the port it's about to send to is congested * or not. */ /* * Interaction with poll is a tad tricky. We want all processes stuck in * poll to wake up and check whether a congested destination became uncongested. * The really sad thing is we have no idea which destinations the application * wants to send to - we don't even know which rds_connections are involved. * So until we implement a more flexible rds poll interface, we have to make * do with this: * We maintain a global counter that is incremented each time a congestion map * update is received. Each rds socket tracks this value, and if rds_poll * finds that the saved generation number is smaller than the global generation * number, it wakes up the process. */ static atomic_t rds_cong_generation = ATOMIC_INIT(0); /* * Congestion monitoring */ static LIST_HEAD(rds_cong_monitor); static DEFINE_RWLOCK(rds_cong_monitor_lock); /* * Yes, a global lock. It's used so infrequently that it's worth keeping it * global to simplify the locking. It's only used in the following * circumstances: * * - on connection buildup to associate a conn with its maps * - on map changes to inform conns of a new map to send * * It's sadly ordered under the socket callback lock and the connection lock. * Receive paths can mark ports congested from interrupt context so the * lock masks interrupts. */ static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; struct rb_node *parent = NULL; struct rds_cong_map *map; while (*p) { int diff; parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); diff = rds_addr_cmp(addr, &map->m_addr); if (diff < 0) p = &(*p)->rb_left; else if (diff > 0) p = &(*p)->rb_right; else return map; } if (insert) { rb_link_node(&insert->m_rb_node, parent, p); rb_insert_color(&insert->m_rb_node, &rds_cong_tree); } return NULL; } /* * There is only ever one bitmap for any address. Connections try and allocate * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. 
*/ static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; unsigned long zp; unsigned long i; unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); if (!map) return NULL; map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { zp = get_zeroed_page(GFP_KERNEL); if (zp == 0) goto out; map->m_page_addrs[i] = zp; } spin_lock_irqsave(&rds_cong_lock, flags); ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); if (!ret) { ret = map; map = NULL; } out: if (map) { for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } /* * Put the conn on its local map's list. This is called when the conn is * really added to the hash. It's nested under the rds_conn_lock, sadly. */ void rds_cong_add_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_remove_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_del_init(&conn->c_map_item); spin_unlock_irqrestore(&rds_cong_lock, flags); } int rds_cong_get_maps(struct rds_connection *conn) { conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; } void rds_cong_queue_updates(struct rds_cong_map *map) { struct rds_connection *conn; unsigned long flags; spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { struct rds_conn_path *cp = &conn->c_path[0]; rcu_read_lock(); if (!test_and_set_bit(0, &conn->c_map_queued) && !rds_destroy_pending(cp->cp_conn)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): * 1. When we get here from the receive path, we * are already holding the sock_lock (held by * tcp_v4_rcv()). So inlining calls to * tcp_setsockopt and/or tcp_sendmsg will deadlock * when it tries to get the sock_lock()) * 2. Interrupts are masked so that we can mark the * port congested from both send and recv paths. * (See comment around declaration of rdc_cong_lock). * An attempt to get the sock_lock() here will * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. 
*/ queue_delayed_work(rds_wq, &cp->cp_send_w, 0); } rcu_read_unlock(); } spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) { rdsdebug("waking map %p for %pI4\n", map, &map->m_addr); rds_stats_inc(s_cong_update_received); atomic_inc(&rds_cong_generation); if (waitqueue_active(&map->m_waitq)) wake_up(&map->m_waitq); if (waitqueue_active(&rds_poll_waitq)) wake_up_all(&rds_poll_waitq); if (portmask && !list_empty(&rds_cong_monitor)) { unsigned long flags; struct rds_sock *rs; read_lock_irqsave(&rds_cong_monitor_lock, flags); list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { spin_lock(&rs->rs_lock); rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); rs->rs_cong_mask &= ~portmask; spin_unlock(&rs->rs_lock); if (rs->rs_cong_notify) rds_wake_sk_sleep(rs); } read_unlock_irqrestore(&rds_cong_monitor_lock, flags); } } EXPORT_SYMBOL_GPL(rds_cong_map_updated); int rds_cong_updated_since(unsigned long *recent) { unsigned long gen = atomic_read(&rds_cong_generation); if (likely(*recent == gen)) return 0; *recent = gen; return 1; } /* * We're called under the locking that protects the sockets receive buffer * consumption. This makes it a lot easier for the caller to only call us * when it knows that an existing set bit needs to be cleared, and vice versa. * We can't block and we need to deal with concurrent sockets working against * the same per-address map. */ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("setting congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("clearing congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) { unsigned long flags; write_lock_irqsave(&rds_cong_monitor_lock, flags); if (list_empty(&rs->rs_cong_list)) list_add(&rs->rs_cong_list, &rds_cong_monitor); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); } void rds_cong_remove_socket(struct rds_sock *rs) { unsigned long flags; struct rds_cong_map *map; write_lock_irqsave(&rds_cong_monitor_lock, flags); list_del_init(&rs->rs_cong_list); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { rds_cong_clear_bit(map, rs->rs_bound_port); rds_cong_queue_updates(map); } } int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs) { if (!rds_cong_test_bit(map, port)) return 0; if (nonblock) { if (rs && rs->rs_cong_monitor) { unsigned long flags; /* It would have been nice to have an atomic set_bit on * a uint64_t. 
*/ spin_lock_irqsave(&rs->rs_lock, flags); rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); spin_unlock_irqrestore(&rs->rs_lock, flags); /* Test again - a congestion update may have arrived in * the meantime. */ if (!rds_cong_test_bit(map, port)) return 0; } rds_stats_inc(s_cong_send_error); return -ENOBUFS; } rds_stats_inc(s_cong_send_blocked); rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); return wait_event_interruptible(map->m_waitq, !rds_cong_test_bit(map, port)); } void rds_cong_exit(void) { struct rb_node *node; struct rds_cong_map *map; unsigned long i; while ((node = rb_first(&rds_cong_tree))) { map = rb_entry(node, struct rds_cong_map, m_rb_node); rdsdebug("freeing map %p\n", map); rb_erase(&map->m_rb_node, &rds_cong_tree); for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } } /* * Allocate a RDS message containing a congestion update. */ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) { struct rds_cong_map *map = conn->c_lcong; struct rds_message *rm; rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); if (!IS_ERR(rm)) rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; return rm; }
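rds_cong_set_bit(), rds_cong_clear_bit() and rds_cong_test_bit() above all split the port number (converted from __be16 to host order with be16_to_cpu()) into a page index and a bit offset within that page. A minimal worked sketch of that split, assuming 4 KiB pages so the 65536-port bitmap spans two pages of 32768 bits; the DEMO_ constants stand in for the RDS_CONG_MAP_* macros.

#include <stdio.h>

#define DEMO_PAGE_SIZE		4096UL
#define DEMO_MAP_PAGE_BITS	(DEMO_PAGE_SIZE * 8)	/* 32768 bits per page */

static void demo_port_to_bit(unsigned int port_host_order)
{
	unsigned long page = port_host_order / DEMO_MAP_PAGE_BITS;
	unsigned long off  = port_host_order % DEMO_MAP_PAGE_BITS;

	printf("port %5u -> page %lu, bit %5lu\n", port_host_order, page, off);
}

int main(void)
{
	demo_port_to_bit(80);		/* page 0, bit 80    */
	demo_port_to_bit(32768);	/* page 1, bit 0     */
	demo_port_to_bit(65535);	/* page 1, bit 32767 */
	return 0;
}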
7734 1437 1323 7724 7833 37 3270 6273 897 895 897 835 67 67 894 899 896 36 67 36 71 6221 6225 236 190 6149 3270 3850 4895 71 4907 4896 2744 4906 25 4877 2414 2407 682 2414 155 155 60 157 129 129 53 53 53 23 23 23 75 76 36 76 3407 3410 3408 3416 3405 3200 6274 3200 87 82 251 3548 3541 2462 1 6255 380 6253 124 122 6254 6256 4765 2573 6223 6006 1171 6230 3542 6227 3553 6224 6223 6243 3519 6252 6266 6260 7618 4108 4103 2632 4137 2628 2413 2079 607 74 5 5 74 12510 7608 7583 7603 2535 6217 12656 12496 12519 11357 6200 6193 130 127 9 53 53 37 53 2397 2395 2399 858 860 1171 157 99 130 132 22 93 92 88 5 379 112 112 112 378 5 5 5 1 5 5 126 45 125 2744 129 9 4 126 126 23 23 22 23 23 23 23 23 53 53 53 243 244 244 244 243 1147 1146 1142 1140 58 58 20 56 13 55 1143 13 13 13 13 12 12 1147 1145 1143 5 5 2 5 2385 2307 2 2304 2378 2276 2378 2380 2377 2382 1139 58 58 52 21 58 52 1138 1140 52 1136 52 173 173 173 172 173 173 909 1 907 913 958 956 910 908 908 1 913 1 1 11646 11640 2918 11447 570 569 11203 11444 11670 8262 11642 11649 6961 6959 6972 6955 451 14 14 14 14 7552 7537 3778 7540 179 11648 11650 11642 11051 980 11046 923 11039 2894 11053 282 11037 5326 11053 419 11042 274 10883 10883 5041 5035 5032 10197 7355 7132 9987 10906 2 9840 9834 22 9863 9865 9644 9647 9655 478 476 478 478 2489 2492 2489 2494 64 59 43 45 31 33 33 33 32 33 33 45 64 63 1329 1335 1336 623 204 204 172 173 172 173 622 9767 620 9474 7799 7402 7391 7416 9779 6372 6372 6209 6300 6657 2219 2222 1323 1327 1319 4 6648 102 102 166 362 166 166 212 212 6833 6851 121 5843 5845 5842 7 7 7 5955 5954 14 5953 9 3 9 9 5960 5955 5962 5956 1 5950 13 7 7 7 7 7 7 7 7 4 4 5953 5952 5968 730 3 5951 5954 5947 5947 5951 217 573 6791 5834 5849 5834 1590 2227 2227 2231 5680 5834 572 2225 4670 183 4673 2 1 1 62 2 60 63 371 37 371 60 323 371 2 410 422 422 424 420 424 421 9 413 63 369 370 50 422 14 14 13 424 413 421 409 422 371 371 359 63 63 63 63 63 423 412 424 424 424 424 13 421 108 423 413 423 358 61 63 63 62 63 502 498 3 3 3 2 3 3 3511 3438 3443 2076 1150 15 15 2 2 13 3 3 3 3 9 15 3427 3513 267 267 187 187 187 187 266 42 16 42 15 42 211 211 211 210 411 410 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 
385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 
1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 
1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

#include <asm/runtime-const.h>

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_children
 *   - children's d_sib and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);

__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);

static struct kmem_cache *dentry_cache __ro_after_init;

const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 *
 * Marking the variables "used" ensures that the compiler doesn't
 * optimize them away completely on architectures with runtime
 * constant infrastructure; this allows debuggers to see their
 * values. But updating these values has no effect on those arches.
 */
static unsigned int d_hash_shift __ro_after_init __used;

static struct hlist_bl_head *dentry_hashtable __ro_after_init __used;

static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
{
        return runtime_const_ptr(dentry_hashtable) +
                runtime_const_shift_right_32(hashlen, d_hash_shift);
}

#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
                                                   unsigned int hash)
{
        hash += (unsigned long) parent / L1_CACHE_BYTES;
        return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT);
}

struct dentry_stat_t {
        long nr_dentry;
        long nr_unused;
        long age_limit;         /* age in seconds */
        long want_pages;        /* pages requested by system */
        long nr_negative;       /* # of unused negative dentries */
        long dummy;             /* Reserved for future use */
};

static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
static int dentry_negative_policy;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering.
*/ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. 
And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } /* * long names are allocated separately from dentry and never modified. * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() * for the reason why ->count and ->head can't be combined into a union. * dentry_string_cmp() relies upon ->name[] being word-aligned. */ struct external_name { atomic_t count; struct rcu_head head; unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { unsigned seq; const unsigned char *s; rcu_read_lock(); retry: seq = read_seqcount_begin(&dentry->d_seq); s = READ_ONCE(dentry->d_name.name); name->name.hash_len = dentry->d_name.hash_len; name->name.name = name->inline_name.string; if (likely(s == dentry->d_shortname.string)) { name->inline_name = dentry->d_shortname; } else { struct external_name *p; p = container_of(s, struct external_name, name[0]); // get a valid reference if (unlikely(!atomic_inc_not_zero(&p->count))) goto retry; name->name.name = s; } if (read_seqcount_retry(&dentry->d_seq, seq)) { release_dentry_name_snapshot(name); goto retry; } rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->count))) kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. 
*/ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. 
*/ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. 
*/ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? 
Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. 
* * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. 
*/ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. 
         * If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        /*
         * Referenced dentries are still in use. If they have active
         * counts, just remove them from the LRU. Otherwise give them
         * another pass through the LRU.
         */
        if (dentry->d_lockref.count) {
                d_lru_isolate(lru, dentry);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        if (dentry->d_flags & DCACHE_REFERENCED) {
                dentry->d_flags &= ~DCACHE_REFERENCED;
                spin_unlock(&dentry->d_lock);

                /*
                 * The list move itself will be made by the common LRU code. At
                 * this point, we've dropped the dentry->d_lock but keep the
                 * lru lock. This is safe to do, since every list movement is
                 * protected by the lru lock even if both locks are held.
                 *
                 * This is guaranteed by the fact that all LRU management
                 * functions are mediated by the LRU API calls like
                 * list_lru_add_obj and list_lru_del_obj. List movement in this
                 * file only ever occurs through these functions or through
                 * callbacks like this one, which are called from the LRU API.
                 *
                 * The only exceptions to this are functions like
                 * shrink_dentry_list, and code that first checks for the
                 * DCACHE_SHRINK_LIST flag. Those are guaranteed to be
                 * operating only with stack-provided lists after they are
                 * properly isolated from the main list. It is thus always a
                 * local access.
                 */
                return LRU_ROTATE;
        }

        d_lru_shrink_move(lru, dentry, freeable);
        spin_unlock(&dentry->d_lock);

        return LRU_REMOVED;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(dispose);
        l