Total coverage: 229773 (12%)of 2011305
44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 // SPDX-License-Identifier: GPL-2.0 /* * Filesystem-level keyring for fscrypt * * Copyright 2019 Google LLC */ /* * This file implements management of fscrypt master keys in the * filesystem-level keyring, including the ioctls: * * - FS_IOC_ADD_ENCRYPTION_KEY * - FS_IOC_REMOVE_ENCRYPTION_KEY * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS * - FS_IOC_GET_ENCRYPTION_KEY_STATUS * * See the "User API" section of Documentation/filesystems/fscrypt.rst for more * information about these ioctls. */ #include <crypto/skcipher.h> #include <linux/export.h> #include <linux/key-type.h> #include <linux/once.h> #include <linux/random.h> #include <linux/seq_file.h> #include <linux/unaligned.h> #include "fscrypt_private.h" /* The master encryption keys for a filesystem (->s_master_keys) */ struct fscrypt_keyring { /* * Lock that protects ->key_hashtable. It does *not* protect the * fscrypt_master_key structs themselves. */ spinlock_t lock; /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */ struct hlist_head key_hashtable[128]; }; static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret) { memzero_explicit(secret, sizeof(*secret)); } static void move_master_key_secret(struct fscrypt_master_key_secret *dst, struct fscrypt_master_key_secret *src) { memcpy(dst, src, sizeof(*dst)); memzero_explicit(src, sizeof(*src)); } static void fscrypt_free_master_key(struct rcu_head *head) { struct fscrypt_master_key *mk = container_of(head, struct fscrypt_master_key, mk_rcu_head); /* * The master key secret and any embedded subkeys should have already * been wiped when the last active reference to the fscrypt_master_key * struct was dropped; doing it here would be unnecessarily late. * Nevertheless, use kfree_sensitive() in case anything was missed. */ kfree_sensitive(mk); } void fscrypt_put_master_key(struct fscrypt_master_key *mk) { if (!refcount_dec_and_test(&mk->mk_struct_refs)) return; /* * No structural references left, so free ->mk_users, and also free the * fscrypt_master_key struct itself after an RCU grace period ensures * that concurrent keyring lookups can no longer find it. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0); if (mk->mk_users) { /* Clear the keyring so the quota gets released right away. */ keyring_clear(mk->mk_users); key_put(mk->mk_users); mk->mk_users = NULL; } call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk) { size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) return; /* * No active references left, so complete the full removal of this * fscrypt_master_key struct by removing it from the keyring and * destroying any subkeys embedded in it. */ if (WARN_ON_ONCE(!sb->s_master_keys)) return; spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_present is false and * ->mk_decrypted_inodes is empty. */ WARN_ON_ONCE(mk->mk_present); WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { fscrypt_destroy_prepared_key( sb, &mk->mk_direct_keys[i]); fscrypt_destroy_prepared_key( sb, &mk->mk_iv_ino_lblk_64_keys[i]); fscrypt_destroy_prepared_key( sb, &mk->mk_iv_ino_lblk_32_keys[i]); } memzero_explicit(&mk->mk_ino_hash_key, sizeof(mk->mk_ino_hash_key)); mk->mk_ino_hash_key_initialized = false; /* Drop the structural ref associated with the active refs. */ fscrypt_put_master_key(mk); } /* * This transitions the key state from present to incompletely removed, and then * potentially to absent (depending on whether inodes remain). */ static void fscrypt_initiate_key_removal(struct super_block *sb, struct fscrypt_master_key *mk) { WRITE_ONCE(mk->mk_present, false); wipe_master_key_secret(&mk->mk_secret); fscrypt_put_master_key_activeref(sb, mk); } static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) { if (spec->__reserved) return false; return master_key_spec_len(spec) != 0; } static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { /* * We just charge FSCRYPT_MAX_RAW_KEY_SIZE bytes to the user's key quota * for each key, regardless of the exact key size. The amount of memory * actually used is greater than the size of the raw key anyway. */ return key_payload_reserve(key, FSCRYPT_MAX_RAW_KEY_SIZE); } static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); } /* * Type of key in ->mk_users. Each key of this type represents a particular * user who has added a particular master key. * * Note that the name of this key type really should be something like * ".fscrypt-user" instead of simply ".fscrypt". But the shorter name is chosen * mainly for simplicity of presentation in /proc/keys when read by a non-root * user. And it is expected to be rare that a key is actually added by multiple * users, since users should keep their encryption keys confidential. */ static struct key_type key_type_fscrypt_user = { .name = ".fscrypt", .instantiate = fscrypt_user_key_instantiate, .describe = fscrypt_user_key_describe, }; #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \ (CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \ CONST_STRLEN("-users") + 1) #define FSCRYPT_MK_USER_DESCRIPTION_SIZE \ (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1) static void format_mk_users_keyring_description( char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { sprintf(description, "fscrypt-%*phN-users", FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier); } static void format_mk_user_description( char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE], const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier, __kuid_val(current_fsuid())); } /* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */ static int allocate_filesystem_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring; if (sb->s_master_keys) return 0; keyring = kzalloc(sizeof(*keyring), GFP_KERNEL); if (!keyring) return -ENOMEM; spin_lock_init(&keyring->lock); /* * Pairs with the smp_load_acquire() in fscrypt_find_master_key(). * I.e., here we publish ->s_master_keys with a RELEASE barrier so that * concurrent tasks can ACQUIRE it. */ smp_store_release(&sb->s_master_keys, keyring); return 0; } /* * Release all encryption keys that have been added to the filesystem, along * with the keyring that contains them. * * This is called at unmount time, after all potentially-encrypted inodes have * been evicted. The filesystem's underlying block device(s) are still * available at this time; this is important because after user file accesses * have been allowed, this function may need to evict keys from the keyslots of * an inline crypto engine, which requires the block device(s). */ void fscrypt_destroy_keyring(struct super_block *sb) { struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!keyring) return; for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) { struct hlist_head *bucket = &keyring->key_hashtable[i]; struct fscrypt_master_key *mk; struct hlist_node *tmp; hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) { /* * Since all potentially-encrypted inodes were already * evicted, every key remaining in the keyring should * have an empty inode list, and should only still be in * the keyring due to the single active ref associated * with ->mk_present. There should be no structural * refs beyond the one associated with the active ref. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1); WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON_ONCE(!mk->mk_present); fscrypt_initiate_key_removal(sb, mk); } } kfree_sensitive(keyring); sb->s_master_keys = NULL; } static struct hlist_head * fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring, const struct fscrypt_key_specifier *mk_spec) { /* * Since key specifiers should be "random" values, it is sufficient to * use a trivial hash function that just takes the first several bits of * the key specifier. */ unsigned long i = get_unaligned((unsigned long *)&mk_spec->u); return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)]; } /* * Find the specified master key struct in ->s_master_keys and take a structural * ref to it. The structural ref guarantees that the key struct continues to * exist, but it does *not* guarantee that ->s_master_keys continues to contain * the key struct. The structural ref needs to be dropped by * fscrypt_put_master_key(). Returns NULL if the key struct is not found. */ struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring; struct hlist_head *bucket; struct fscrypt_master_key *mk; /* * Pairs with the smp_store_release() in allocate_filesystem_keyring(). * I.e., another task can publish ->s_master_keys concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ keyring = smp_load_acquire(&sb->s_master_keys); if (keyring == NULL) return NULL; /* No keyring yet, so no keys yet. */ bucket = fscrypt_mk_hash_bucket(keyring, mk_spec); rcu_read_lock(); switch (mk_spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: hlist_for_each_entry_rcu(mk, bucket, mk_node) { if (mk->mk_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && memcmp(mk->mk_spec.u.descriptor, mk_spec->u.descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 && refcount_inc_not_zero(&mk->mk_struct_refs)) goto out; } break; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: hlist_for_each_entry_rcu(mk, bucket, mk_node) { if (mk->mk_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && memcmp(mk->mk_spec.u.identifier, mk_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 && refcount_inc_not_zero(&mk->mk_struct_refs)) goto out; } break; } mk = NULL; out: rcu_read_unlock(); return mk; } static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE]; struct key *keyring; format_mk_users_keyring_description(description, mk->mk_spec.u.identifier); keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); mk->mk_users = keyring; return 0; } /* * Find the current user's "key" in the master key's ->mk_users. * Returns ERR_PTR(-ENOKEY) if not found. */ static struct key *find_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; key_ref_t keyref; format_mk_user_description(description, mk->mk_spec.u.identifier); /* * We need to mark the keyring reference as "possessed" so that we * acquire permission to search it, via the KEY_POS_SEARCH permission. */ keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/), &key_type_fscrypt_user, description, false); if (IS_ERR(keyref)) { if (PTR_ERR(keyref) == -EAGAIN || /* not found */ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */ keyref = ERR_PTR(-ENOKEY); return ERR_CAST(keyref); } return key_ref_to_ptr(keyref); } /* * Give the current user a "key" in ->mk_users. This charges the user's quota * and marks the master key as added by the current user, so that it cannot be * removed by another user with the key. Either ->mk_sem must be held for * write, or the master key must be still undergoing initialization. */ static int add_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; struct key *mk_user; int err; format_mk_user_description(description, mk->mk_spec.u.identifier); mk_user = key_alloc(&key_type_fscrypt_user, description, current_fsuid(), current_gid(), current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); key_put(mk_user); return err; } /* * Remove the current user's "key" from ->mk_users. * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ static int remove_master_key_user(struct fscrypt_master_key *mk) { struct key *mk_user; int err; mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_unlink(mk->mk_users, mk_user); key_put(mk_user); return err; } /* * Allocate a new fscrypt_master_key, transfer the given secret over to it, and * insert it into sb->s_master_keys. */ static int add_new_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) goto out_put; err = add_master_key_user(mk); if (err) goto out_put; } move_master_key_secret(&mk->mk_secret, secret); mk->mk_present = true; refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */ spin_lock(&keyring->lock); hlist_add_head_rcu(&mk->mk_node, fscrypt_mk_hash_bucket(keyring, mk_spec)); spin_unlock(&keyring->lock); return 0; out_put: fscrypt_put_master_key(mk); return err; } #define KEY_DEAD 1 static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { int err; /* * If the current user is already in ->mk_users, then there's nothing to * do. Otherwise, we need to add the user to ->mk_users. (Neither is * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { struct key *mk_user = find_master_key_user(mk); if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } err = add_master_key_user(mk); if (err) return err; } /* If the key is incompletely removed, make it present again. */ if (!mk->mk_present) { if (!refcount_inc_not_zero(&mk->mk_active_refs)) { /* * Raced with the last active ref being dropped, so the * key has become, or is about to become, "absent". * Therefore, we need to allocate a new key struct. */ return KEY_DEAD; } move_master_key_secret(&mk->mk_secret, secret); WRITE_ONCE(mk->mk_present, true); } return 0; } static int do_add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ mk = fscrypt_find_master_key(sb, mk_spec); if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. */ err = allocate_filesystem_keyring(sb); if (!err) err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Add the user to ->mk_users * if needed, and make the key "present" again if possible. */ down_write(&mk->mk_sem); err = add_existing_master_key(mk, secret); up_write(&mk->mk_sem); if (err == KEY_DEAD) { /* * We found a key struct, but it's already been fully * removed. Ignore the old struct and add a new one. * fscrypt_add_key_mutex means we don't need to worry * about concurrent adds. */ err = add_new_master_key(sb, secret, mk_spec); } fscrypt_put_master_key(mk); } mutex_unlock(&fscrypt_add_key_mutex); return err; } static int add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, struct fscrypt_key_specifier *key_spec) { int err; if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]; u8 *kdf_key = secret->bytes; unsigned int kdf_key_size = secret->size; u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY; /* * For raw keys, the fscrypt master key is used directly as the * fscrypt KDF key. For hardware-wrapped keys, we have to pass * the master key to the hardware to derive the KDF key, which * is then only used to derive non-file-contents subkeys. */ if (secret->is_hw_wrapped) { err = fscrypt_derive_sw_secret(sb, secret->bytes, secret->size, sw_secret); if (err) return err; kdf_key = sw_secret; kdf_key_size = sizeof(sw_secret); /* * To avoid weird behavior if someone manages to * determine sw_secret and add it as a raw key, ensure * that hardware-wrapped keys and raw keys will have * different key identifiers by deriving their key * identifiers using different KDF contexts. */ keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; } fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); /* * Now that the KDF context is initialized, the raw KDF key is * no longer needed. */ memzero_explicit(kdf_key, kdf_key_size); /* Calculate the key identifier */ fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, key_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); } return do_add_master_key(sb, secret, key_spec); } /* * Validate the size of an fscrypt master key being added. Note that this is * just an initial check, as we don't know which ciphers will be used yet. * There is a stricter size check later when the key is actually used by a file. */ static inline bool fscrypt_valid_key_size(size_t size, u32 add_key_flags) { u32 max_size = (add_key_flags & FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) ? FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE : FSCRYPT_MAX_RAW_KEY_SIZE; return size >= FSCRYPT_MIN_KEY_SIZE && size <= max_size; } static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; if (prep->datalen < sizeof(*payload)) return -EINVAL; if (!fscrypt_valid_key_size(prep->datalen - sizeof(*payload), payload->flags)) return -EINVAL; if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; if (payload->flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); if (!prep->payload.data[0]) return -ENOMEM; prep->quotalen = prep->datalen; return 0; } static void fscrypt_provisioning_key_free_preparse( struct key_preparsed_payload *prep) { kfree_sensitive(prep->payload.data[0]); } static void fscrypt_provisioning_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); if (key_is_positive(key)) { const struct fscrypt_provisioning_key_payload *payload = key->payload.data[0]; seq_printf(m, ": %u [%u]", key->datalen, payload->type); } } static void fscrypt_provisioning_key_destroy(struct key *key) { kfree_sensitive(key->payload.data[0]); } static struct key_type key_type_fscrypt_provisioning = { .name = "fscrypt-provisioning", .preparse = fscrypt_provisioning_key_preparse, .free_preparse = fscrypt_provisioning_key_free_preparse, .instantiate = generic_key_instantiate, .describe = fscrypt_provisioning_key_describe, .destroy = fscrypt_provisioning_key_destroy, }; /* * Retrieve the key from the Linux keyring key specified by 'key_id', and store * it into 'secret'. * * The key must be of type "fscrypt-provisioning" and must have the 'type' and * 'flags' field of the payload set to the given values, indicating that the key * is intended for use for the specified purpose. We don't use the "logon" key * type because there's no way to completely restrict the use of such keys; they * can be used by any kernel API that accepts "logon" keys and doesn't require a * specific service prefix. * * The ability to specify the key via Linux keyring key is intended for cases * where userspace needs to re-add keys after the filesystem is unmounted and * re-mounted. Most users should just provide the key directly instead. */ static int get_keyring_key(u32 key_id, u32 type, u32 flags, struct fscrypt_master_key_secret *secret) { key_ref_t ref; struct key *key; const struct fscrypt_provisioning_key_payload *payload; int err; ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH); if (IS_ERR(ref)) return PTR_ERR(ref); key = key_ref_to_ptr(ref); if (key->type != &key_type_fscrypt_provisioning) goto bad_key; payload = key->payload.data[0]; /* * Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. * Similarly, don't allow hardware-wrapped keys to be used as * non-hardware-wrapped keys and vice versa. */ if (payload->type != type || payload->flags != flags) goto bad_key; secret->size = key->datalen - sizeof(*payload); memcpy(secret->bytes, payload->raw, secret->size); err = 0; goto out_put; bad_key: err = -EKEYREJECTED; out_put: key_ref_put(ref); return err; } /* * Add a master encryption key to the filesystem, causing all files which were * encrypted with it to appear "unlocked" (decrypted) when accessed. * * When adding a key for use by v1 encryption policies, this ioctl is * privileged, and userspace must provide the 'key_descriptor'. * * When adding a key for use by v2+ encryption policies, this ioctl is * unprivileged. This is needed, in general, to allow non-root users to use * encryption without encountering the visibility problems of process-subscribed * keyrings and the inability to properly remove keys. This works by having * each key identified by its cryptographically secure hash --- the * 'key_identifier'. The cryptographic hash ensures that a malicious user * cannot add the wrong key for a given identifier. Furthermore, each added key * is charged to the appropriate user's quota for the keyrings service, which * prevents a malicious user from adding too many keys. Finally, we forbid a * user from removing a key while other users have added it too, which prevents * a user who knows another user's key from causing a denial-of-service by * removing it at an inopportune time. (We tolerate that a user who knows a key * can prevent other users from removing it.) * * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_add_key_arg __user *uarg = _uarg; struct fscrypt_add_key_arg arg; struct fscrypt_master_key_secret secret; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add keys that are identified by an arbitrary descriptor * rather than by a cryptographic hash --- since otherwise a malicious * user could add the wrong key. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; memset(&secret, 0, sizeof(secret)); if (arg.flags) { if (arg.flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; secret.is_hw_wrapped = true; } if (arg.key_id) { if (arg.raw_size != 0) return -EINVAL; err = get_keyring_key(arg.key_id, arg.key_spec.type, arg.flags, &secret); if (err) goto out_wipe_secret; } else { if (!fscrypt_valid_key_size(arg.raw_size, arg.flags)) return -EINVAL; secret.size = arg.raw_size; err = -EFAULT; if (copy_from_user(secret.bytes, uarg->raw, secret.size)) goto out_wipe_secret; } err = add_master_key(sb, &secret, &arg.key_spec); if (err) goto out_wipe_secret; /* Return the key identifier to userspace, if applicable */ err = -EFAULT; if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE)) goto out_wipe_secret; err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); static void fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { static u8 test_key[FSCRYPT_MAX_RAW_KEY_SIZE]; get_random_once(test_key, sizeof(test_key)); memset(secret, 0, sizeof(*secret)); secret->size = sizeof(test_key); memcpy(secret->bytes, test_key, sizeof(test_key)); } void fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_master_key_secret secret; fscrypt_get_test_dummy_secret(&secret); fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); wipe_master_key_secret(&secret); } /** * fscrypt_add_test_dummy_key() - add the test dummy encryption key * @sb: the filesystem instance to add the key to * @key_spec: the key specifier of the test dummy encryption key * * Add the key for the test_dummy_encryption mount option to the filesystem. To * prevent misuse of this mount option, a per-boot random key is used instead of * a hardcoded one. This makes it so that any encrypted files created using * this option won't be accessible after a reboot. * * Return: 0 on success, -errno on failure */ int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec) { struct fscrypt_master_key_secret secret; int err; fscrypt_get_test_dummy_secret(&secret); err = add_master_key(sb, &secret, key_spec); wipe_master_key_secret(&secret); return err; } /* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting * their files using some other user's key which they don't actually know. * Cryptographically this isn't much of a problem, but the semantics of this * would be a bit weird, so it's best to just forbid it. * * The system administrator (CAP_FOWNER) can override this, which should be * enough for any use cases where encryption policies are being set using keys * that were chosen ahead of time but aren't available at the moment. * * Note that the key may have already removed by the time this returns, but * that's okay; we just care whether the key was there at some point. * * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code */ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); mk = fscrypt_find_master_key(sb, &mk_spec); if (!mk) { err = -ENOKEY; goto out; } down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); } else { key_put(mk_user); err = 0; } up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; return err; } /* * Try to evict the inode's dentries from the dentry cache. If the inode is a * directory, then it can have at most one dentry; however, that dentry may be * pinned by child dentries, so first try to evict the children too. */ static void shrink_dcache_inode(struct inode *inode) { struct dentry *dentry; if (S_ISDIR(inode->i_mode)) { dentry = d_find_any_alias(inode); if (dentry) { shrink_dcache_parent(dentry); dput(dentry); } } d_prune_aliases(inode); } static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) { struct fscrypt_inode_info *ci; struct inode *inode; struct inode *toput_inode = NULL; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&mk->mk_decrypted_inodes_lock); shrink_dcache_inode(inode); iput(toput_inode); toput_inode = inode; spin_lock(&mk->mk_decrypted_inodes_lock); } spin_unlock(&mk->mk_decrypted_inodes_lock); iput(toput_inode); } static int check_for_busy_inodes(struct super_block *sb, struct fscrypt_master_key *mk) { struct list_head *pos; size_t busy_count = 0; unsigned long ino; char ino_str[50] = ""; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each(pos, &mk->mk_decrypted_inodes) busy_count++; if (busy_count == 0) { spin_unlock(&mk->mk_decrypted_inodes_lock); return 0; } { /* select an example file to show for debugging purposes */ struct inode *inode = list_first_entry(&mk->mk_decrypted_inodes, struct fscrypt_inode_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; } spin_unlock(&mk->mk_decrypted_inodes_lock); /* If the inode is currently being created, ino may still be 0. */ if (ino) snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); fscrypt_warn(NULL, "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, ino_str); return -EBUSY; } static int try_to_lock_encrypted_files(struct super_block *sb, struct fscrypt_master_key *mk) { int err1; int err2; /* * An inode can't be evicted while it is dirty or has dirty pages. * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. * * Just do it the easy way: call sync_filesystem(). It's overkill, but * it works, and it's more important to minimize the amount of caches we * drop than the amount of data we sync. Also, unprivileged users can * already call sync_filesystem() via sys_syncfs() or sys_sync(). */ down_read(&sb->s_umount); err1 = sync_filesystem(sb); up_read(&sb->s_umount); /* If a sync error occurs, still try to evict as much as possible. */ /* * Inodes are pinned by their dentries, so we have to evict their * dentries. shrink_dcache_sb() would suffice, but would be overkill * and inappropriate for use by unprivileged users. So instead go * through the inodes' alias lists and try to evict each dentry. */ evict_dentries_for_decrypted_inodes(mk); /* * evict_dentries_for_decrypted_inodes() already iput() each inode in * the list; any inodes for which that dropped the last reference will * have been evicted due to fscrypt_drop_inode() detecting the key * removal and telling the VFS to evict the inode. So to finish, we * just need to check whether any inodes couldn't be evicted. */ err2 = check_for_busy_inodes(sb, mk); return err1 ?: err2; } /* * Try to remove an fscrypt master encryption key. * * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's * claim to the key, then removes the key itself if no other users have claims. * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the * key itself. * * To "remove the key itself", first we transition the key to the "incompletely * removed" state, so that no more inodes can be unlocked with it. Then we try * to evict all cached inodes that had been unlocked with the key. * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" * state where it tracks the list of remaining inodes. Userspace can execute * the ioctl again later to retry eviction, or alternatively can re-add the key. * * For more details, see the "Removing keys" section of * Documentation/filesystems/fscrypt.rst. */ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add and remove keys that are identified by an arbitrary * descriptor rather than by a cryptographic hash. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; /* Find the key being removed. */ mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) return -ENOKEY; down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { if (all_users) err = keyring_clear(mk->mk_users); else err = remove_master_key_user(mk); if (err) { up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { /* * Other users have still added the key too. We removed * the current user's claim to the key, but we still * can't remove the key itself. */ status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Initiate removal of the key. */ err = -ENOKEY; if (mk->mk_present) { fscrypt_initiate_key_removal(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; up_write(&mk->mk_sem); if (inodes_remain) { /* Some inodes still reference this key; try to evict them. */ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; err = 0; } } /* * We return 0 if we successfully did something: removed a claim to the * key, initiated removal of the key, or tried locking the files again. * Users need to check the informational status flags if they care * whether the key has been fully removed including all files locked. */ out_put_key: fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; } int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg) { return do_remove_key(filp, uarg, false); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; return do_remove_key(filp, uarg, true); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); /* * Retrieve the status of an fscrypt master encryption key. * * We set ->status to indicate whether the key is absent, present, or * incompletely removed. (For an explanation of what these statuses mean and * how they are represented internally, see struct fscrypt_master_key.) This * field allows applications to easily determine the status of an encrypted * directory without using a hack such as trying to open a regular file in it * (which can confuse the "incompletely removed" status with absent or present). * * In addition, for v2 policy keys we allow applications to determine, via * ->status_flags and ->user_count, whether the key has been added by the * current user, by other users, or by both. Most applications should not need * this, since ordinarily only one user should know a given key. However, if a * secret key is shared by multiple users, applications may wish to add an * already-present key to prevent other users from removing it. This ioctl can * be used to check whether that really is the case before the work is done to * add the key --- which might e.g. require prompting the user for a passphrase. * * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; struct fscrypt_master_key *mk; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; arg.status_flags = 0; arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } down_read(&mk->mk_sem); if (!mk->mk_present) { arg.status = refcount_read(&mk->mk_active_refs) > 0 ? FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; err = 0; goto out_release_key; } arg.status = FSCRYPT_KEY_STATUS_PRESENT; if (mk->mk_users) { struct key *mk_user; arg.user_count = mk->mk_users->keys.nr_leaves_on_tree; mk_user = find_master_key_user(mk); if (!IS_ERR(mk_user)) { arg.status_flags |= FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF; key_put(mk_user); } else if (mk_user != ERR_PTR(-ENOKEY)) { err = PTR_ERR(mk_user); goto out_release_key; } } err = 0; out_release_key: up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (!err && copy_to_user(uarg, &arg, sizeof(arg))) err = -EFAULT; return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status); int __init fscrypt_init_keyring(void) { int err; err = register_key_type(&key_type_fscrypt_user); if (err) return err; err = register_key_type(&key_type_fscrypt_provisioning); if (err) goto err_unregister_fscrypt_user; return 0; err_unregister_fscrypt_user: unregister_key_type(&key_type_fscrypt_user); return err; }
638 638 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TEXT_PATCHING_H #define _ASM_X86_TEXT_PATCHING_H #include <linux/types.h> #include <linux/stddef.h> #include <asm/ptrace.h> /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ #define TEXT_POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. * Side-effect: any interrupt handler running between save and restore will have * the ability to write to read-only pages. * * Warning: * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * On the local CPU you need to be protected against NMI or MCE handlers seeing * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void smp_text_poke_sync_each_cpu(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int smp_text_poke_int3_handler(struct pt_regs *regs); extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate); extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC #define RET_INSN_SIZE 1 #define RET_INSN_OPCODE 0xC3 #define CALL_INSN_SIZE 5 #define CALL_INSN_OPCODE 0xE8 #define JMP32_INSN_SIZE 5 #define JMP32_INSN_OPCODE 0xE9 #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB #define DISP32_SIZE 4 static __always_inline int text_opcode_size(u8 opcode) { int size = 0; #define __CASE(insn) \ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break switch(opcode) { __CASE(INT3); __CASE(RET); __CASE(CALL); __CASE(JMP32); __CASE(JMP8); } #undef __CASE return size; } union text_poke_insn { u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; } __attribute__((packed)); }; static __always_inline void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size) { union text_poke_insn *insn = buf; BUG_ON(size < text_opcode_size(opcode)); /* * Hide the addresses to avoid the compiler folding in constants when * referencing code, these can mess up annotations like * ANNOTATE_NOENDBR. */ OPTIMIZER_HIDE_VAR(insn); OPTIMIZER_HIDE_VAR(addr); OPTIMIZER_HIDE_VAR(dest); insn->opcode = opcode; if (size > 1) { insn->disp = (long)dest - (long)(addr + size); if (size == 2) { /* * Ensure that for JMP8 the displacement * actually fits the signed byte. */ BUG_ON((insn->disp >> 31) != (insn->disp >> 7)); } } } static __always_inline void *text_gen_insn(u8 opcode, const void *addr, const void *dest) { static union text_poke_insn insn; /* per instance */ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode)); return &insn.text; } extern int after_bootmem; extern __ro_after_init struct mm_struct *text_poke_mm; extern __ro_after_init unsigned long text_poke_mm_addr; #ifndef CONFIG_UML_X86 static __always_inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The INT3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's X86_TRAP_BP logic. * * Similarly, entry_32.S will have a gap on the stack for * (any) hardware exception and pt_regs; see the * FIXUP_FRAME macro. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; } static __always_inline unsigned long int3_emulate_pop(struct pt_regs *regs) { unsigned long val = *(unsigned long *)regs->sp; regs->sp += sizeof(unsigned long); return val; } static __always_inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } static __always_inline void int3_emulate_ret(struct pt_regs *regs) { unsigned long ip = int3_emulate_pop(regs); int3_emulate_jmp(regs, ip); } static __always_inline bool __emulate_cc(unsigned long flags, u8 cc) { static const unsigned long cc_mask[6] = { [0] = X86_EFLAGS_OF, [1] = X86_EFLAGS_CF, [2] = X86_EFLAGS_ZF, [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, [4] = X86_EFLAGS_SF, [5] = X86_EFLAGS_PF, }; bool invert = cc & 1; bool match; if (cc < 0xc) { match = flags & cc_mask[cc >> 1]; } else { match = ((flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (cc >= 0xe) match = match || (flags & X86_EFLAGS_ZF); } return (match && !invert) || (!match && invert); } static __always_inline void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp) { if (__emulate_cc(regs->flags, cc)) ip += disp; int3_emulate_jmp(regs, ip); } #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */
110 29 114 116 19 19 110 29 117 123 119 123 123 93 118 121 121 118 8 4 4 8 3 5 172 63 63 72 52 52 15 644 644 135 645 43 15 644 295 294 151 95 87 87 74 75 295 10 10 102 103 103 103 103 103 103 102 103 102 102 102 103 103 275 269 30 31 28 29 27 27 11 11 11 11 11 272 273 86 84 72 70 73 73 10 10 10 63 63 49 12 12 12 41 40 54 54 54 54 53 54 53 54 84 6 6 6 6 274 271 271 138 140 133 109 132 111 1 1 1 1 1 1 112 28 26 23 7 103 21 21 82 83 83 7 7 83 81 82 83 83 22 83 267 83 272 286 248 242 242 80 248 247 246 74 74 74 19 70 70 75 271 269 268 269 24 22 23 23 24 24 10 24 24 24 22 22 10 9 9 10 10 10 17 22 22 17 22 22 22 24 61 61 1 59 60 60 47 45 47 46 231 219 22 22 21 22 228 30 30 20 20 10 10 30 3 1 3 13 13 8 2 13 13 2 2 11 11 13 773 32 229 22 230 231 231 231 15 15 14 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/runtime.c - Helper functions for device runtime PM * * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu> */ #include <linux/sched/mm.h> #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/rculist.h> #include <trace/events/rpm.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); static inline pm_callback_t get_callback_ptr(const void *start, size_t offset) { return *(pm_callback_t *)(start + offset); } static pm_callback_t __rpm_get_driver_callback(struct device *dev, size_t cb_offset) { if (dev->driver && dev->driver->pm) return get_callback_ptr(dev->driver->pm, cb_offset); return NULL; } static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { const struct dev_pm_ops *ops; pm_callback_t cb = NULL; if (dev->pm_domain) ops = &dev->pm_domain->ops; else if (dev->type && dev->type->pm) ops = dev->type->pm; else if (dev->class && dev->class->pm) ops = dev->class->pm; else if (dev->bus && dev->bus->pm) ops = dev->bus->pm; else ops = NULL; if (ops) cb = get_callback_ptr(ops, cb_offset); if (!cb) cb = __rpm_get_driver_callback(dev, cb_offset); return cb; } #define RPM_GET_CALLBACK(dev, callback) \ __rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback)) static int rpm_resume(struct device *dev, int rpmflags); static int rpm_suspend(struct device *dev, int rpmflags); /** * update_pm_runtime_accounting - Update the time accounting of power states * @dev: Device to update the accounting for * * In order to be able to have time accounting of the various power states * (as used by programs such as PowerTOP to show the effectiveness of runtime * PM), we need to track the time spent in each state. * update_pm_runtime_accounting must be called each time before the * runtime_status field is updated, to account the time in the old state * correctly. */ static void update_pm_runtime_accounting(struct device *dev) { u64 now, last, delta; if (dev->power.disable_depth > 0) return; last = dev->power.accounting_timestamp; now = ktime_get_mono_fast_ns(); dev->power.accounting_timestamp = now; /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved * timesptamp. */ if (now < last) return; delta = now - last; if (dev->power.runtime_status == RPM_SUSPENDED) dev->power.suspended_time += delta; else dev->power.active_time += delta; } static void __update_runtime_status(struct device *dev, enum rpm_status status) { update_pm_runtime_accounting(dev); trace_rpm_status(dev, status); dev->power.runtime_status = status; } static u64 rpm_get_accounted_time(struct device *dev, bool suspended) { u64 time; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); update_pm_runtime_accounting(dev); time = suspended ? dev->power.suspended_time : dev->power.active_time; spin_unlock_irqrestore(&dev->power.lock, flags); return time; } u64 pm_runtime_active_time(struct device *dev) { return rpm_get_accounted_time(dev, false); } u64 pm_runtime_suspended_time(struct device *dev) { return rpm_get_accounted_time(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_suspended_time); /** * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. * @dev: Device to handle. */ static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } /** * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. * @dev: Device to handle. */ static void pm_runtime_cancel_pending(struct device *dev) { pm_runtime_deactivate_timer(dev); /* * In case there's a request pending, make sure its work function will * return without doing anything. */ dev->power.request = RPM_REQ_NONE; } /* * pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time. * @dev: Device to handle. * * Compute the autosuspend-delay expiration time based on the device's * power.last_busy time. If the delay has already expired or is disabled * (negative) or the power.use_autosuspend flag isn't set, return 0. * Otherwise return the expiration time in nanoseconds (adjusted to be nonzero). * * This function may be called either with or without dev->power.lock held. * Either way it can be racy, since power.last_busy may be updated at any time. */ u64 pm_runtime_autosuspend_expiration(struct device *dev) { int autosuspend_delay; u64 expires; if (!dev->power.use_autosuspend) return 0; autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) return 0; expires = READ_ONCE(dev->power.last_busy); expires += (u64)autosuspend_delay * NSEC_PER_MSEC; if (expires > ktime_get_mono_fast_ns()) return expires; /* Expires in the future */ return 0; } EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); static int dev_memalloc_noio(struct device *dev, void *data) { return dev->power.memalloc_noio; } /* * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. * @dev: Device to handle. * @enable: True for setting the flag and False for clearing the flag. * * Set the flag for all devices in the path from the device to the * root device in the device tree if @enable is true, otherwise clear * the flag for devices in the path whose siblings don't set the flag. * * The function should only be called by block device, or network * device driver for solving the deadlock problem during runtime * resume/suspend: * * If memory allocation with GFP_KERNEL is called inside runtime * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block * device becomes active and the involed page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * * The lock of dev_hotplug_mutex is held in the function for handling * hotplug race because pm_runtime_set_memalloc_noio() may be called * in async probe(). * * The function should be called between device_add() and device_del() * on the affected device(block/network device). */ void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) { static DEFINE_MUTEX(dev_hotplug_mutex); mutex_lock(&dev_hotplug_mutex); for (;;) { bool enabled; /* hold power lock since bitfield is not SMP-safe. */ spin_lock_irq(&dev->power.lock); enabled = dev->power.memalloc_noio; dev->power.memalloc_noio = enable; spin_unlock_irq(&dev->power.lock); /* * not need to enable ancestors any more if the device * has been enabled. */ if (enabled && enable) break; dev = dev->parent; /* * clear flag of the parent device only if all the * children don't set the flag because ancestor's * flag was set by any one of the descendants. */ if (!dev || (!enable && device_for_each_child(dev, NULL, dev_memalloc_noio))) break; } mutex_unlock(&dev_hotplug_mutex); } EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); /** * rpm_check_suspend_allowed - Test whether a device may be suspended. * @dev: Device to test. */ static int rpm_check_suspend_allowed(struct device *dev) { int retval = 0; if (dev->power.runtime_error) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ else if ((dev->power.deferred_resume && dev->power.runtime_status == RPM_SUSPENDING) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; else if (__dev_pm_qos_resume_latency(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; return retval; } static int rpm_get_suppliers(struct device *dev) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { int retval; if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); /* Ignore suppliers with disabled runtime PM. */ if (retval < 0 && retval != -EACCES) { pm_runtime_put_noidle(link->supplier); return retval; } refcount_inc(&link->rpm_active); } return 0; } /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. * * Drop all runtime PM references associated with @link to its supplier device. */ void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; /* * The additional power.usage_count check is a safety net in case * the rpm_active refcount becomes saturated, in which case * refcount_dec_not_one() would return true forever, but it is not * strictly necessary. */ while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { pm_runtime_release_supplier(link); if (try_to_suspend) pm_request_idle(link->supplier); } } static void rpm_put_suppliers(struct device *dev) { __rpm_put_suppliers(dev, true); } static void rpm_suspend_suppliers(struct device *dev) { struct device_link *link; int idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) pm_request_idle(link->supplier); device_links_read_unlock(idx); } /** * __rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int retval = 0, idx; bool use_links = dev->power.links_count > 0; if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); } else { spin_unlock_irq(&dev->power.lock); /* * Resume suppliers if necessary. * * The device's runtime PM status cannot change until this * routine returns, so it is safe to read the status outside of * the lock. */ if (use_links && dev->power.runtime_status == RPM_RESUMING) { idx = device_links_read_lock(); retval = rpm_get_suppliers(dev); if (retval) { rpm_put_suppliers(dev); goto fail; } device_links_read_unlock(idx); } } if (cb) retval = cb(dev); if (dev->power.irq_safe) { spin_lock(&dev->power.lock); } else { /* * If the device is suspending and the callback has returned * success, drop the usage counters of the suppliers that have * been reference counted on its resume. * * Do that if resume fails too. */ if (use_links && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) || (dev->power.runtime_status == RPM_RESUMING && retval))) { idx = device_links_read_lock(); __rpm_put_suppliers(dev, false); fail: device_links_read_unlock(idx); } spin_lock_irq(&dev->power.lock); } return retval; } /** * rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int rpm_callback(int (*cb)(struct device *), struct device *dev) { int retval; if (dev->power.memalloc_noio) { unsigned int noio_flag; /* * Deadlock might be caused if memory allocation with * GFP_KERNEL happens inside runtime_suspend and * runtime_resume callbacks of one block device's * ancestor or the block device itself. Network * device might be thought as part of iSCSI block * device, so network device and its ancestor should * be marked as memalloc_noio too. */ noio_flag = memalloc_noio_save(); retval = __rpm_callback(cb, dev); memalloc_noio_restore(noio_flag); } else { retval = __rpm_callback(cb, dev); } /* * Since -EACCES means that runtime PM is disabled for the given device, * it should not be returned by runtime PM callbacks. If it is returned * nevertheless, assume it to be a transient error and convert it to * -EAGAIN. */ if (retval == -EACCES) retval = -EAGAIN; if (retval != -EAGAIN && retval != -EBUSY) dev->power.runtime_error = retval; return retval; } /** * rpm_idle - Notify device bus type if the device can be suspended. * @dev: Device to notify the bus type about. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. If * another idle notification has been started earlier, return immediately. If * the RPM_ASYNC flag is set then queue an idle-notification request; otherwise * run the ->runtime_idle() callback directly. If the ->runtime_idle callback * doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); int retval; trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ else if ((rpmflags & RPM_GET_PUT) && retval == 1) ; /* put() is allowed in RPM_SUSPENDED */ /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; /* * Any pending request other than an idle notification takes * precedence over us, except that the timer may be running. */ else if (dev->power.request_pending && dev->power.request > RPM_REQ_IDLE) retval = -EAGAIN; /* Act as though RPM_NOWAIT is always set. */ else if (dev->power.idle_notification) retval = -EINPROGRESS; if (retval) goto out; /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; callback = RPM_GET_CALLBACK(dev, runtime_idle); /* If no callback assume success. */ if (!callback || dev->power.no_callbacks) goto out; /* Carry out an asynchronous or a synchronous idle notification. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_IDLE; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } dev->power.idle_notification = true; if (dev->power.irq_safe) spin_unlock(&dev->power.lock); else spin_unlock_irq(&dev->power.lock); retval = callback(dev); if (dev->power.irq_safe) spin_lock(&dev->power.lock); else spin_lock_irq(&dev->power.lock); dev->power.idle_notification = false; wake_up_all(&dev->power.wait_queue); out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } /** * rpm_suspend - Carry out runtime suspend of given device. * @dev: Device to suspend. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. * Cancel a pending idle notification, autosuspend or suspend. If * another suspend has been started earlier, either return immediately * or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC * flags. If the RPM_ASYNC flag is set then queue a suspend request; * otherwise run the ->runtime_suspend() callback directly. When * ->runtime_suspend succeeded, if a deferred resume was requested while * the callback was running then carry it out, otherwise send an idle * notification for its parent (if the suspend succeeded and both * ignore_children of parent->power and irq_safe of dev->power are not set). * If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO * flag is set and the next autosuspend-delay expiration time is in the * future, schedule another autosuspend attempt. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_suspend(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval; trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); if (retval < 0) goto out; /* Conditions are wrong. */ /* Synchronous suspends are not allowed in the RPM_RESUMING state. */ if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC)) retval = -EAGAIN; if (retval) goto out; /* If the autosuspend_delay time hasn't expired yet, reschedule. */ if ((rpmflags & RPM_AUTO) && dev->power.runtime_status != RPM_SUSPENDING) { u64 expires = pm_runtime_autosuspend_expiration(dev); if (expires != 0) { /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; /* * Optimization: If the timer is already running and is * set to expire at or before the autosuspend delay, * avoid the overhead of resetting it. Just let it * expire; pm_suspend_timer_fn() will take care of the * rest. */ if (!(dev->power.timer_expires && dev->power.timer_expires <= expires)) { /* * We add a slack of 25% to gather wakeups * without sacrificing the granularity. */ u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) * (NSEC_PER_MSEC >> 2); dev->power.timer_expires = expires; hrtimer_start_range_ns(&dev->power.suspend_timer, ns_to_ktime(expires), slack, HRTIMER_MODE_ABS); } dev->power.timer_autosuspends = 1; goto out; } } /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); if (dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { retval = -EINPROGRESS; goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the other suspend running in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ /* Carry out an asynchronous or a synchronous suspend. */ if (rpmflags & RPM_ASYNC) { dev->power.request = (rpmflags & RPM_AUTO) ? RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } goto out; } __update_runtime_status(dev, RPM_SUSPENDING); callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; dev_pm_enable_wake_irq_complete(dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) { parent = dev->parent; atomic_add_unless(&parent->power.child_count, -1, 0); } wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { dev->power.deferred_resume = false; rpm_resume(dev, 0); retval = -EAGAIN; goto out; } if (dev->power.irq_safe) goto out; /* Maybe the parent is now able to suspend. */ if (parent && !parent->power.ignore_children) { spin_unlock(&dev->power.lock); spin_lock(&parent->power.lock); rpm_idle(parent, RPM_ASYNC); spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); } /* Maybe the suppliers are now able to suspend. */ if (dev->power.links_count > 0) { spin_unlock_irq(&dev->power.lock); rpm_suspend_suppliers(dev); spin_lock_irq(&dev->power.lock); } out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; fail: dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); /* * On transient errors, if the callback routine failed an autosuspend, * and if the last_busy time has been updated so that there is a new * autosuspend expiration time, automatically reschedule another * autosuspend. */ if (!dev->power.runtime_error && (rpmflags & RPM_AUTO) && pm_runtime_autosuspend_expiration(dev) != 0) goto repeat; pm_runtime_cancel_pending(dev); goto out; } /** * rpm_resume - Carry out runtime resume of given device. * @dev: Device to resume. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be resumed. Cancel * any scheduled or pending requests. If another resume has been started * earlier, either return immediately or wait for it to finish, depending on the * RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in * parallel with this function, either tell the other process to resume after * suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC * flag is set then queue a resume request; otherwise run the * ->runtime_resume() callback directly. Queue an idle notification for the * device if the resume succeeded. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_resume(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval = 0; trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { retval = -EINVAL; } else if (dev->power.disable_depth > 0) { if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; else if (rpmflags & RPM_TRANSPARENT) goto out; else retval = -EACCES; } if (retval) goto out; /* * Other scheduled or pending requests need to be canceled. Small * optimization: If an autosuspend timer is running, leave it running * rather than cancelling it now only to restart it again in the near * future. */ dev->power.request = RPM_REQ_NONE; if (!dev->power.timer_autosuspends) pm_runtime_deactivate_timer(dev); if (dev->power.runtime_status == RPM_ACTIVE) { retval = 1; goto out; } if (dev->power.runtime_status == RPM_RESUMING || dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { if (dev->power.runtime_status == RPM_SUSPENDING) { dev->power.deferred_resume = true; if (rpmflags & RPM_NOWAIT) retval = -EINPROGRESS; } else { retval = -EINPROGRESS; } goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the operation carried out in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_RESUMING && dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } /* * See if we can skip waking up the parent. This is safe only if * power.no_callbacks is set, because otherwise we don't know whether * the resume will actually succeed. */ if (dev->power.no_callbacks && !parent && dev->parent) { spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING); if (dev->parent->power.disable_depth > 0 || dev->parent->power.ignore_children || dev->parent->power.runtime_status == RPM_ACTIVE) { atomic_inc(&dev->parent->power.child_count); spin_unlock(&dev->parent->power.lock); retval = 1; goto no_callback; /* Assume success. */ } spin_unlock(&dev->parent->power.lock); } /* Carry out an asynchronous or a synchronous resume. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_RESUME; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } retval = 0; goto out; } if (!parent && dev->parent) { /* * Increment the parent's usage counter and resume it if * necessary. Not needed if dev is irq-safe; then the * parent is permanently resumed. */ parent = dev->parent; if (dev->power.irq_safe) goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); spin_lock(&parent->power.lock); /* * Resume the parent if it has runtime PM enabled and not been * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { rpm_resume(parent, 0); if (parent->power.runtime_status != RPM_ACTIVE) retval = -EBUSY; } spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); if (retval) goto out; goto repeat; } skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ __update_runtime_status(dev, RPM_RESUMING); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); pm_runtime_mark_last_busy(dev); if (parent) atomic_inc(&parent->power.child_count); } wake_up_all(&dev->power.wait_queue); if (retval >= 0) rpm_idle(dev, RPM_ASYNC); out: if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); spin_lock_irq(&dev->power.lock); } trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } /** * pm_runtime_work - Universal runtime PM work function. * @work: Work structure used for scheduling the execution of this function. * * Use @work to get the device object the work is to be done for, determine what * is to be done and execute the appropriate runtime PM function. */ static void pm_runtime_work(struct work_struct *work) { struct device *dev = container_of(work, struct device, power.work); enum rpm_request req; spin_lock_irq(&dev->power.lock); if (!dev->power.request_pending) goto out; req = dev->power.request; dev->power.request = RPM_REQ_NONE; dev->power.request_pending = false; switch (req) { case RPM_REQ_NONE: break; case RPM_REQ_IDLE: rpm_idle(dev, RPM_NOWAIT); break; case RPM_REQ_SUSPEND: rpm_suspend(dev, RPM_NOWAIT); break; case RPM_REQ_AUTOSUSPEND: rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO); break; case RPM_REQ_RESUME: rpm_resume(dev, RPM_NOWAIT); break; } out: spin_unlock_irq(&dev->power.lock); } /** * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). * @timer: hrtimer used by pm_schedule_suspend(). * * Check if the time is right and queue a suspend request. */ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) { struct device *dev = container_of(timer, struct device, power.suspend_timer); unsigned long flags; u64 expires; spin_lock_irqsave(&dev->power.lock, flags); expires = dev->power.timer_expires; /* * If 'expires' is after the current time, we've been called * too early. */ if (expires > 0 && expires <= ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); } spin_unlock_irqrestore(&dev->power.lock, flags); return HRTIMER_NORESTART; } /** * pm_schedule_suspend - Set up a timer to submit a suspend request in future. * @dev: Device to suspend. * @delay: Time to wait before submitting a suspend request, in milliseconds. */ int pm_schedule_suspend(struct device *dev, unsigned int delay) { unsigned long flags; u64 expires; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (!delay) { retval = rpm_suspend(dev, RPM_ASYNC); goto out; } retval = rpm_check_suspend_allowed(dev); if (retval) goto out; /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC; dev->power.timer_expires = expires; dev->power.timer_autosuspends = 0; hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS); out: spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_schedule_suspend); static int rpm_drop_usage_count(struct device *dev) { int ret; ret = atomic_sub_return(1, &dev->power.usage_count); if (ret >= 0) return ret; /* * Because rpm_resume() does not check the usage counter, it will resume * the device even if the usage counter is 0 or negative, so it is * sufficient to increment the usage counter here to reverse the change * made above. */ atomic_inc(&dev->power.usage_count); dev_warn(dev, "Runtime PM usage count underflow!\n"); return -EINVAL; } /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_idle(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_idle(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_idle); /** * __pm_runtime_suspend - Entry point for runtime put/suspend operations. * @dev: Device to suspend. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_suspend(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_suspend(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_suspend); /** * __pm_runtime_resume - Entry point for runtime resume operations. * @dev: Device to resume. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, increment the device's usage count. Then * carry out a resume, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_resume(struct device *dev, int rpmflags) { unsigned long flags; int retval; might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe && dev->power.runtime_status != RPM_ACTIVE); if (rpmflags & RPM_GET_PUT) atomic_inc(&dev->power.usage_count); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_resume(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** * pm_runtime_get_conditional - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * * Return -EINVAL if runtime PM is disabled for @dev. * * Otherwise, if its runtime PM status is %RPM_ACTIVE and (1) @ign_usage_count * is set, or (2) @dev is not ignoring children and its active child count is * nonero, or (3) the runtime PM usage counter of @dev is not zero, increment * the usage counter of @dev and return 1. * * Otherwise, return 0 without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. * * If @ign_usage_count is %false, this function can be used to prevent * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ static int pm_runtime_get_conditional(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.disable_depth > 0) { retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; } else if (ign_usage_count || (!dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0)) { retval = 1; atomic_inc(&dev->power.usage_count); } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } /** * pm_runtime_get_if_active - Bump up runtime PM usage counter if the device is * in active state * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE, in which case it returns 1. If the device is in a different * state, 0 is returned. -EINVAL is returned if runtime PM is disabled for the * device, in which case also the usage_count will remain unmodified. */ int pm_runtime_get_if_active(struct device *dev) { return pm_runtime_get_conditional(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0 or it is not * ignoring children and its active child count is nonzero. 1 is returned in * this case. * * If @dev is in a different state or it is not in use (that is, its usage * counter is 0, or it is ignoring children, or its active child count is 0), * 0 is returned. * * -EINVAL is returned if runtime PM is disabled for the device, in which case * also the usage counter of @dev is not updated. */ int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_conditional(dev, false); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_in_use); /** * __pm_runtime_set_status - Set runtime PM status of a device. * @dev: Device to handle. * @status: New runtime PM status of the device. * * If runtime PM of the device is disabled or its power.runtime_error field is * different from zero, the status may be changed either to RPM_ACTIVE, or to * RPM_SUSPENDED, as long as that reflects the actual state of the device. * However, if the device has a parent and the parent is not active, and the * parent's power.ignore_children flag is unset, the device's status cannot be * set to RPM_ACTIVE, so -EBUSY is returned in that case. * * If successful, __pm_runtime_set_status() clears the power.runtime_error field * and the device parent's counter of unsuspended children is modified to * reflect the new status. If the new status is RPM_SUSPENDED, an idle * notification request for the parent is submitted. * * If @dev has any suppliers (as reflected by device links to them), and @status * is RPM_ACTIVE, they will be activated upfront and if the activation of one * of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead * of the @status value) and the suppliers will be deacticated on exit. The * error returned by the failing supplier activation will be returned in that * case. */ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an * error if it is enabled already and working. */ if (dev->power.runtime_error || dev->power.disable_depth) dev->power.disable_depth++; else error = -EAGAIN; spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; /* * If the new status is RPM_ACTIVE, the suppliers can be activated * upfront regardless of the current status, because next time * rpm_put_suppliers() runs, the rpm_active refcounts of the links * involved will be dropped down to one anyway. */ if (status == RPM_ACTIVE) { int idx = device_links_read_lock(); error = rpm_get_suppliers(dev); if (error) status = RPM_SUSPENDED; device_links_read_unlock(idx); } spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; if (status == RPM_SUSPENDED) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; } else { spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is * not active, has runtime PM enabled and the * 'power.ignore_children' flag unset. */ if (!parent->power.disable_depth && !parent->power.ignore_children && parent->power.runtime_status != RPM_ACTIVE) { dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n", dev_name(dev), dev_name(parent)); error = -EBUSY; } else if (dev->power.runtime_status == RPM_SUSPENDED) { atomic_inc(&parent->power.child_count); } spin_unlock(&parent->power.lock); if (error) { status = RPM_SUSPENDED; goto out; } } out_set: __update_runtime_status(dev, status); if (!error) dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); if (status == RPM_SUSPENDED) { int idx = device_links_read_lock(); rpm_put_suppliers(dev); device_links_read_unlock(idx); } pm_runtime_enable(dev); return error; } EXPORT_SYMBOL_GPL(__pm_runtime_set_status); /** * __pm_runtime_barrier - Cancel pending requests and wait for completions. * @dev: Device to handle. * * Flush all pending requests for the device from pm_wq and wait for all * runtime PM operations involving the device in progress to complete. * * Should be called under dev->power.lock with interrupts disabled. */ static void __pm_runtime_barrier(struct device *dev) { pm_runtime_deactivate_timer(dev); if (dev->power.request_pending) { dev->power.request = RPM_REQ_NONE; spin_unlock_irq(&dev->power.lock); cancel_work_sync(&dev->power.work); spin_lock_irq(&dev->power.lock); dev->power.request_pending = false; } if (dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || dev->power.idle_notification) { DEFINE_WAIT(wait); /* Suspend, wake-up or idle notification in progress. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING && dev->power.runtime_status != RPM_RESUMING && !dev->power.idle_notification) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); } } /** * pm_runtime_barrier - Flush pending requests and wait for completions. * @dev: Device to handle. * * Prevent the device from being suspended by incrementing its usage counter and * if there's a pending resume request for the device, wake the device up. * Next, make sure that all pending requests for the device have been flushed * from pm_wq and wait for all runtime PM operations involving the device in * progress to complete. * * Return value: * 1, if there was a resume request pending and the device had to be woken up, * 0, otherwise */ int pm_runtime_barrier(struct device *dev) { int retval = 0; pm_runtime_get_noresume(dev); spin_lock_irq(&dev->power.lock); if (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { rpm_resume(dev, 0); retval = 1; } __pm_runtime_barrier(dev); spin_unlock_irq(&dev->power.lock); pm_runtime_put_noidle(dev); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_barrier); bool pm_runtime_block_if_disabled(struct device *dev) { bool ret; spin_lock_irq(&dev->power.lock); ret = !pm_runtime_enabled(dev); if (ret && dev->power.last_status == RPM_INVALID) dev->power.last_status = RPM_BLOCKED; spin_unlock_irq(&dev->power.lock); return ret; } void pm_runtime_unblock(struct device *dev) { spin_lock_irq(&dev->power.lock); if (dev->power.last_status == RPM_BLOCKED) dev->power.last_status = RPM_INVALID; spin_unlock_irq(&dev->power.lock); } void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); if (dev->power.disable_depth > 0) { dev->power.disable_depth++; goto out; } /* * Wake up the device if there's a resume request pending, because that * means there probably is some I/O to process and disabling runtime PM * shouldn't prevent the device from processing the I/O. */ if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { /* * Prevent suspends and idle notifications from being carried * out after we have woken up the device. */ pm_runtime_get_noresume(dev); rpm_resume(dev, 0); pm_runtime_put_noidle(dev); } /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); dev->power.last_status = dev->power.runtime_status; } out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_disable); /** * pm_runtime_enable - Enable runtime PM of a device. * @dev: Device to handle. */ void pm_runtime_enable(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); goto out; } if (--dev->power.disable_depth > 0) goto out; if (dev->power.last_status == RPM_BLOCKED) { dev_warn(dev, "Attempt to enable runtime PM when it is blocked\n"); dump_stack(); } dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); if (dev->power.runtime_status == RPM_SUSPENDED && !dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0) dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); static void pm_runtime_set_suspended_action(void *data) { pm_runtime_set_suspended(data); } /** * devm_pm_runtime_set_active_enabled - set_active version of devm_pm_runtime_enable. * * @dev: Device to handle. */ int devm_pm_runtime_set_active_enabled(struct device *dev) { int err; err = pm_runtime_set_active(dev); if (err) return err; err = devm_add_action_or_reset(dev, pm_runtime_set_suspended_action, dev); if (err) return err; return devm_pm_runtime_enable(dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_set_active_enabled); static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } /** * devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable. * * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for * you at driver exit time if needed. * * @dev: Device to handle. */ int devm_pm_runtime_enable(struct device *dev) { pm_runtime_enable(dev); return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_enable); static void pm_runtime_put_noidle_action(void *data) { pm_runtime_put_noidle(data); } /** * devm_pm_runtime_get_noresume - devres-enabled version of pm_runtime_get_noresume. * * @dev: Device to handle. */ int devm_pm_runtime_get_noresume(struct device *dev) { pm_runtime_get_noresume(dev); return devm_add_action_or_reset(dev, pm_runtime_put_noidle_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_get_noresume); /** * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * * Increase the device's usage count and clear its power.runtime_auto flag, * so that it cannot be suspended at run time until pm_runtime_allow() is called * for it. */ void pm_runtime_forbid(struct device *dev) { spin_lock_irq(&dev->power.lock); if (!dev->power.runtime_auto) goto out; dev->power.runtime_auto = false; atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_forbid); /** * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * * Decrease the device's usage count and set its power.runtime_auto flag. */ void pm_runtime_allow(struct device *dev) { int ret; spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; ret = rpm_drop_usage_count(dev); if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_allow); /** * pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device. * @dev: Device to handle. * * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no runtime PM * callbacks of its own. The runtime sysfs attributes will be removed. */ void pm_runtime_no_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_callbacks = 1; spin_unlock_irq(&dev->power.lock); if (device_is_registered(dev)) rpm_sysfs_remove(dev); } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); /** * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. * @dev: Device to handle * * Set the power.irq_safe flag, which tells the PM core that the * ->runtime_suspend() and ->runtime_resume() callbacks for this device should * always be invoked with the spinlock held and interrupts disabled. It also * causes the parent's usage counter to be permanently incremented, preventing * the parent from runtime suspending -- otherwise an irq-safe child might have * to wait for a non-irq-safe parent. */ void pm_runtime_irq_safe(struct device *dev) { if (dev->parent) pm_runtime_get_sync(dev->parent); spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 1; spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. * @old_delay: The former autosuspend_delay value. * @old_use: The former use_autosuspend value. * * Prevent runtime suspend if the new delay is negative and use_autosuspend is * set; otherwise allow it. Send an idle notification if suspends are allowed. * * This function must be called under dev->power.lock with interrupts disabled. */ static void update_autosuspend(struct device *dev, int old_delay, int old_use) { int delay = dev->power.autosuspend_delay; /* Should runtime suspend be prevented now? */ if (dev->power.use_autosuspend && delay < 0) { /* If it used to be allowed then prevent it. */ if (!old_use || old_delay >= 0) { atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { trace_rpm_usage(dev, 0); } } /* Runtime suspend should be allowed now. */ else { /* If it used to be prevented then allow it. */ if (old_use && old_delay < 0) atomic_dec(&dev->power.usage_count); /* Maybe we can autosuspend now. */ rpm_idle(dev, RPM_AUTO); } } /** * pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value. * @dev: Device to handle. * @delay: Value of the new delay in milliseconds. * * Set the device's power.autosuspend_delay value. If it changes to negative * and the power.use_autosuspend flag is set, prevent runtime suspends. If it * changes the other way, allow runtime suspends. */ void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.autosuspend_delay = delay; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay); /** * __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag. * @dev: Device to handle. * @use: New value for use_autosuspend. * * Set the device's power.use_autosuspend flag, and allow or prevent runtime * suspends as needed. */ void __pm_runtime_use_autosuspend(struct device *dev, bool use) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.use_autosuspend = use; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); /** * pm_runtime_init - Initialize runtime PM fields in given device object. * @dev: Device object to initialize. */ void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; atomic_set(&dev->power.usage_count, 0); dev->power.runtime_error = 0; atomic_set(&dev->power.child_count, 0); pm_suspend_ignore_children(dev, false); dev->power.runtime_auto = true; dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; dev->power.needs_force_resume = false; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; hrtimer_setup(&dev->power.suspend_timer, pm_suspend_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); init_waitqueue_head(&dev->power.wait_queue); } /** * pm_runtime_reinit - Re-initialize runtime PM fields in given device object. * @dev: Device object to re-initialize. */ void pm_runtime_reinit(struct device *dev) { if (!pm_runtime_enabled(dev)) { if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); if (dev->power.irq_safe) { spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 0; spin_unlock_irq(&dev->power.lock); if (dev->parent) pm_runtime_put(dev->parent); } } /* * Clear power.needs_force_resume in case it has been set by * pm_runtime_force_suspend() invoked from a driver remove callback. */ dev->power.needs_force_resume = false; } /** * pm_runtime_remove - Prepare for removing a device from device hierarchy. * @dev: Device object being removed from device hierarchy. */ void pm_runtime_remove(struct device *dev) { __pm_runtime_disable(dev, false); pm_runtime_reinit(dev); } /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ void pm_runtime_get_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) if (device_link_test(link, DL_FLAG_PM_RUNTIME)) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); } device_links_read_unlock(idx); } /** * pm_runtime_put_suppliers - Drop references to supplier devices. * @dev: Consumer device. */ void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; pm_runtime_put(link->supplier); } device_links_read_unlock(idx); } void pm_runtime_new_link(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.links_count++; spin_unlock_irq(&dev->power.lock); } static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); dev->power.links_count--; spin_unlock_irq(&dev->power.lock); } /** * pm_runtime_drop_link - Prepare for device link removal. * @link: Device link going away. * * Drop the link count of the consumer end of @link and decrement the supplier * device's runtime PM usage counter as many times as needed to drop all of the * PM runtime reference to it from the consumer. */ void pm_runtime_drop_link(struct device_link *link) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) return; pm_runtime_drop_link_count(link->consumer); pm_runtime_release_supplier(link); pm_request_idle(link->supplier); } static pm_callback_t get_callback(struct device *dev, size_t cb_offset) { /* * Setting power.strict_midlayer means that the middle layer * code does not want its runtime PM callbacks to be invoked via * pm_runtime_force_suspend() and pm_runtime_force_resume(), so * return a direct pointer to the driver callback in that case. */ if (dev_pm_strict_midlayer_is_set(dev)) return __rpm_get_driver_callback(dev, cb_offset); return __rpm_get_callback(dev, cb_offset); } #define GET_CALLBACK(dev, callback) \ get_callback(dev, offsetof(struct dev_pm_ops, callback)) /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. * * Disable runtime PM so we safely can check the device's runtime PM status and * if it is active, invoke its ->runtime_suspend callback to suspend it and * change its runtime PM status field to RPM_SUSPENDED. Also, if the device's * usage and children counters don't indicate that the device was in use before * the system-wide transition under way, decrement its parent's children counter * (if there is a parent). Keep runtime PM disabled to preserve the state * unless we encounter errors. * * Typically this function may be invoked from a system suspend callback to make * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. */ int pm_runtime_force_suspend(struct device *dev) { int (*callback)(struct device *); int ret; pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev) || dev->power.needs_force_resume) return 0; callback = GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? callback(dev) : 0; if (ret) goto err; dev_pm_enable_wake_irq_complete(dev); /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of * its parent and the usage counters of its suppliers. Otherwise, set * power.needs_force_resume to let pm_runtime_force_resume() know that * the device needs to be taken care of and to prevent this function * from handling the device again in case the device is passed to it * once more subsequently. */ if (pm_runtime_need_not_resume(dev)) pm_runtime_set_suspended(dev); else dev->power.needs_force_resume = true; return 0; err: dev_pm_disable_wake_irq_check(dev, true); pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); #ifdef CONFIG_PM_SLEEP /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * This function expects that either pm_runtime_force_suspend() has put the * device into a low-power state prior to calling it, or the device had been * runtime-suspended before the preceding system-wide suspend transition and it * was left in suspend during that transition. * * The actions carried out by pm_runtime_force_suspend(), or by a runtime * suspend in general, are reversed and the device is brought back into full * power if it is expected to be used on system resume, which is the case when * its needs_force_resume flag is set or when its smart_suspend flag is set and * its runtime PM status is "active". * * In other cases, the resume is deferred to be managed via runtime PM. * * Typically, this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; if (!dev->power.needs_force_resume && (!dev_pm_smart_suspend(dev) || pm_runtime_status_suspended(dev))) goto out; callback = GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; if (ret) { pm_runtime_set_suspended(dev); dev_pm_enable_wake_irq_check(dev, false); goto out; } pm_runtime_mark_last_busy(dev); out: /* * The smart_suspend flag can be cleared here because it is not going * to be necessary until the next system-wide suspend transition that * will update it again. */ dev->power.smart_suspend = false; /* * Also clear needs_force_resume to make this function skip devices that * have been seen by it once. */ dev->power.needs_force_resume = false; pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume); bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || dev->power.ignore_children); } #endif /* CONFIG_PM_SLEEP */
6 6 6 5 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: exfldio - Aml Field I/O * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acinterp.h" #include "amlcode.h" #include "acevents.h" #include "acdispat.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exfldio") /* Local prototypes */ static acpi_status acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, u32 field_datum_byte_offset, u64 *value, u32 read_write); static u8 acpi_ex_register_overflow(union acpi_operand_object *obj_desc, u64 value); static acpi_status acpi_ex_setup_region(union acpi_operand_object *obj_desc, u32 field_datum_byte_offset); /******************************************************************************* * * FUNCTION: acpi_ex_setup_region * * PARAMETERS: obj_desc - Field to be read or written * field_datum_byte_offset - Byte offset of this datum within the * parent field * * RETURN: Status * * DESCRIPTION: Common processing for acpi_ex_extract_from_field and * acpi_ex_insert_into_field. Initialize the Region if necessary and * validate the request. * ******************************************************************************/ static acpi_status acpi_ex_setup_region(union acpi_operand_object *obj_desc, u32 field_datum_byte_offset) { acpi_status status = AE_OK; union acpi_operand_object *rgn_desc; u8 space_id; ACPI_FUNCTION_TRACE_U32(ex_setup_region, field_datum_byte_offset); rgn_desc = obj_desc->common_field.region_obj; /* We must have a valid region */ if (rgn_desc->common.type != ACPI_TYPE_REGION) { ACPI_ERROR((AE_INFO, "Needed Region, found type 0x%X (%s)", rgn_desc->common.type, acpi_ut_get_object_type_name(rgn_desc))); return_ACPI_STATUS(AE_AML_OPERAND_TYPE); } space_id = rgn_desc->region.space_id; /* Validate the Space ID */ if (!acpi_is_valid_space_id(space_id)) { ACPI_ERROR((AE_INFO, "Invalid/unknown Address Space ID: 0x%2.2X", space_id)); return_ACPI_STATUS(AE_AML_INVALID_SPACE_ID); } /* * If the Region Address and Length have not been previously evaluated, * evaluate them now and save the results. */ if (!(rgn_desc->common.flags & AOPOBJ_DATA_VALID)) { status = acpi_ds_get_region_arguments(rgn_desc); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } } /* * Exit now for SMBus, GSBus or IPMI address space, it has a non-linear * address space and the request cannot be directly validated */ if (space_id == ACPI_ADR_SPACE_SMBUS || space_id == ACPI_ADR_SPACE_GSBUS || space_id == ACPI_ADR_SPACE_IPMI) { /* SMBus or IPMI has a non-linear address space */ return_ACPI_STATUS(AE_OK); } #ifdef ACPI_UNDER_DEVELOPMENT /* * If the Field access is any_acc, we can now compute the optimal * access (because we know the length of the parent region) */ if (!(obj_desc->common.flags & AOPOBJ_DATA_VALID)) { if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } } #endif /* * Validate the request. The entire request from the byte offset for a * length of one field datum (access width) must fit within the region. * (Region length is specified in bytes) */ if (rgn_desc->region.length < (obj_desc->common_field.base_byte_offset + field_datum_byte_offset + obj_desc->common_field.access_byte_width)) { if (acpi_gbl_enable_interpreter_slack) { /* * Slack mode only: We will go ahead and allow access to this * field if it is within the region length rounded up to the next * access width boundary. acpi_size cast for 64-bit compile. */ if (ACPI_ROUND_UP(rgn_desc->region.length, obj_desc->common_field. access_byte_width) >= ((acpi_size)obj_desc->common_field. base_byte_offset + obj_desc->common_field.access_byte_width + field_datum_byte_offset)) { return_ACPI_STATUS(AE_OK); } } if (rgn_desc->region.length < obj_desc->common_field.access_byte_width) { /* * This is the case where the access_type (acc_word, etc.) is wider * than the region itself. For example, a region of length one * byte, and a field with Dword access specified. */ ACPI_ERROR((AE_INFO, "Field [%4.4s] access width (%u bytes) " "too large for region [%4.4s] (length %u)", acpi_ut_get_node_name(obj_desc-> common_field.node), obj_desc->common_field.access_byte_width, acpi_ut_get_node_name(rgn_desc->region. node), rgn_desc->region.length)); } /* * Offset rounded up to next multiple of field width * exceeds region length, indicate an error */ ACPI_ERROR((AE_INFO, "Field [%4.4s] Base+Offset+Width %u+%u+%u " "is beyond end of region [%4.4s] (length %u)", acpi_ut_get_node_name(obj_desc->common_field.node), obj_desc->common_field.base_byte_offset, field_datum_byte_offset, obj_desc->common_field.access_byte_width, acpi_ut_get_node_name(rgn_desc->region.node), rgn_desc->region.length)); return_ACPI_STATUS(AE_AML_REGION_LIMIT); } return_ACPI_STATUS(AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ex_access_region * * PARAMETERS: obj_desc - Field to be read * field_datum_byte_offset - Byte offset of this datum within the * parent field * value - Where to store value (must at least * 64 bits) * function - Read or Write flag plus other region- * dependent flags * * RETURN: Status * * DESCRIPTION: Read or Write a single field datum to an Operation Region. * ******************************************************************************/ acpi_status acpi_ex_access_region(union acpi_operand_object *obj_desc, u32 field_datum_byte_offset, u64 *value, u32 function) { acpi_status status; union acpi_operand_object *rgn_desc; u32 region_offset; ACPI_FUNCTION_TRACE(ex_access_region); /* * Ensure that the region operands are fully evaluated and verify * the validity of the request */ status = acpi_ex_setup_region(obj_desc, field_datum_byte_offset); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } /* * The physical address of this field datum is: * * 1) The base of the region, plus * 2) The base offset of the field, plus * 3) The current offset into the field */ rgn_desc = obj_desc->common_field.region_obj; region_offset = obj_desc->common_field.base_byte_offset + field_datum_byte_offset; if ((function & ACPI_IO_MASK) == ACPI_READ) { ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "[READ]")); } else { ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "[WRITE]")); } ACPI_DEBUG_PRINT_RAW((ACPI_DB_BFIELD, " Region [%s:%X], Width %X, ByteBase %X, Offset %X at %8.8X%8.8X\n", acpi_ut_get_region_name(rgn_desc->region. space_id), rgn_desc->region.space_id, obj_desc->common_field.access_byte_width, obj_desc->common_field.base_byte_offset, field_datum_byte_offset, ACPI_FORMAT_UINT64(rgn_desc->region.address + region_offset))); /* Invoke the appropriate address_space/op_region handler */ status = acpi_ev_address_space_dispatch(rgn_desc, obj_desc, function, region_offset, ACPI_MUL_8(obj_desc-> common_field. access_byte_width), value); if (ACPI_FAILURE(status)) { if (status == AE_NOT_IMPLEMENTED) { ACPI_ERROR((AE_INFO, "Region %s (ID=%u) not implemented", acpi_ut_get_region_name(rgn_desc->region. space_id), rgn_desc->region.space_id)); } else if (status == AE_NOT_EXIST) { ACPI_ERROR((AE_INFO, "Region %s (ID=%u) has no handler", acpi_ut_get_region_name(rgn_desc->region. space_id), rgn_desc->region.space_id)); } } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ex_register_overflow * * PARAMETERS: obj_desc - Register(Field) to be written * value - Value to be stored * * RETURN: TRUE if value overflows the field, FALSE otherwise * * DESCRIPTION: Check if a value is out of range of the field being written. * Used to check if the values written to Index and Bank registers * are out of range. Normally, the value is simply truncated * to fit the field, but this case is most likely a serious * coding error in the ASL. * ******************************************************************************/ static u8 acpi_ex_register_overflow(union acpi_operand_object *obj_desc, u64 value) { if (obj_desc->common_field.bit_length >= ACPI_INTEGER_BIT_SIZE) { /* * The field is large enough to hold the maximum integer, so we can * never overflow it. */ return (FALSE); } if (value >= ((u64) 1 << obj_desc->common_field.bit_length)) { /* * The Value is larger than the maximum value that can fit into * the register. */ ACPI_ERROR((AE_INFO, "Index value 0x%8.8X%8.8X overflows field width 0x%X", ACPI_FORMAT_UINT64(value), obj_desc->common_field.bit_length)); return (TRUE); } /* The Value will fit into the field with no truncation */ return (FALSE); } /******************************************************************************* * * FUNCTION: acpi_ex_field_datum_io * * PARAMETERS: obj_desc - Field to be read * field_datum_byte_offset - Byte offset of this datum within the * parent field * value - Where to store value (must be 64 bits) * read_write - Read or Write flag * * RETURN: Status * * DESCRIPTION: Read or Write a single datum of a field. The field_type is * demultiplexed here to handle the different types of fields * (buffer_field, region_field, index_field, bank_field) * ******************************************************************************/ static acpi_status acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, u32 field_datum_byte_offset, u64 *value, u32 read_write) { acpi_status status; u64 local_value; ACPI_FUNCTION_TRACE_U32(ex_field_datum_io, field_datum_byte_offset); if (read_write == ACPI_READ) { if (!value) { local_value = 0; /* To support reads without saving return value */ value = &local_value; } /* Clear the entire return buffer first, [Very Important!] */ *value = 0; } /* * The four types of fields are: * * buffer_field - Read/write from/to a Buffer * region_field - Read/write from/to a Operation Region. * bank_field - Write to a Bank Register, then read/write from/to an * operation_region * index_field - Write to an Index Register, then read/write from/to a * Data Register */ switch (obj_desc->common.type) { case ACPI_TYPE_BUFFER_FIELD: /* * If the buffer_field arguments have not been previously evaluated, * evaluate them now and save the results. */ if (!(obj_desc->common.flags & AOPOBJ_DATA_VALID)) { status = acpi_ds_get_buffer_field_arguments(obj_desc); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } } if (read_write == ACPI_READ) { /* * Copy the data from the source buffer. * Length is the field width in bytes. */ memcpy(value, (obj_desc->buffer_field.buffer_obj)->buffer. pointer + obj_desc->buffer_field.base_byte_offset + field_datum_byte_offset, obj_desc->common_field.access_byte_width); } else { /* * Copy the data to the target buffer. * Length is the field width in bytes. */ memcpy((obj_desc->buffer_field.buffer_obj)->buffer. pointer + obj_desc->buffer_field.base_byte_offset + field_datum_byte_offset, value, obj_desc->common_field.access_byte_width); } status = AE_OK; break; case ACPI_TYPE_LOCAL_BANK_FIELD: /* * Ensure that the bank_value is not beyond the capacity of * the register */ if (acpi_ex_register_overflow(obj_desc->bank_field.bank_obj, (u64) obj_desc->bank_field. value)) { return_ACPI_STATUS(AE_AML_REGISTER_LIMIT); } /* * For bank_fields, we must write the bank_value to the bank_register * (itself a region_field) before we can access the data. */ status = acpi_ex_insert_into_field(obj_desc->bank_field.bank_obj, &obj_desc->bank_field.value, sizeof(obj_desc->bank_field. value)); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } /* * Now that the Bank has been selected, fall through to the * region_field case and write the datum to the Operation Region */ ACPI_FALLTHROUGH; case ACPI_TYPE_LOCAL_REGION_FIELD: /* * For simple region_fields, we just directly access the owning * Operation Region. */ status = acpi_ex_access_region(obj_desc, field_datum_byte_offset, value, read_write); break; case ACPI_TYPE_LOCAL_INDEX_FIELD: /* * Ensure that the index_value is not beyond the capacity of * the register */ if (acpi_ex_register_overflow(obj_desc->index_field.index_obj, (u64) obj_desc->index_field. value)) { return_ACPI_STATUS(AE_AML_REGISTER_LIMIT); } /* Write the index value to the index_register (itself a region_field) */ field_datum_byte_offset += obj_desc->index_field.value; ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "Write to Index Register: Value %8.8X\n", field_datum_byte_offset)); status = acpi_ex_insert_into_field(obj_desc->index_field.index_obj, &field_datum_byte_offset, sizeof(field_datum_byte_offset)); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } if (read_write == ACPI_READ) { /* Read the datum from the data_register */ ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "Read from Data Register\n")); status = acpi_ex_extract_from_field(obj_desc->index_field. data_obj, value, sizeof(u64)); } else { /* Write the datum to the data_register */ ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "Write to Data Register: Value %8.8X%8.8X\n", ACPI_FORMAT_UINT64(*value))); status = acpi_ex_insert_into_field(obj_desc->index_field. data_obj, value, sizeof(u64)); } break; default: ACPI_ERROR((AE_INFO, "Wrong object type in field I/O %u", obj_desc->common.type)); status = AE_AML_INTERNAL; break; } if (ACPI_SUCCESS(status)) { if (read_write == ACPI_READ) { ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "Value Read %8.8X%8.8X, Width %u\n", ACPI_FORMAT_UINT64(*value), obj_desc->common_field. access_byte_width)); } else { ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "Value Written %8.8X%8.8X, Width %u\n", ACPI_FORMAT_UINT64(*value), obj_desc->common_field. access_byte_width)); } } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ex_write_with_update_rule * * PARAMETERS: obj_desc - Field to be written * mask - bitmask within field datum * field_value - Value to write * field_datum_byte_offset - Offset of datum within field * * RETURN: Status * * DESCRIPTION: Apply the field update rule to a field write * ******************************************************************************/ acpi_status acpi_ex_write_with_update_rule(union acpi_operand_object *obj_desc, u64 mask, u64 field_value, u32 field_datum_byte_offset) { acpi_status status = AE_OK; u64 merged_value; u64 current_value; ACPI_FUNCTION_TRACE_U32(ex_write_with_update_rule, mask); /* Start with the new bits */ merged_value = field_value; /* If the mask is all ones, we don't need to worry about the update rule */ if (mask != ACPI_UINT64_MAX) { /* Decode the update rule */ switch (obj_desc->common_field. field_flags & AML_FIELD_UPDATE_RULE_MASK) { case AML_FIELD_UPDATE_PRESERVE: /* * Check if update rule needs to be applied (not if mask is all * ones) The left shift drops the bits we want to ignore. */ if ((~mask << (ACPI_MUL_8(sizeof(mask)) - ACPI_MUL_8(obj_desc->common_field. access_byte_width))) != 0) { /* * Read the current contents of the byte/word/dword containing * the field, and merge with the new field value. */ status = acpi_ex_field_datum_io(obj_desc, field_datum_byte_offset, &current_value, ACPI_READ); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } merged_value |= (current_value & ~mask); } break; case AML_FIELD_UPDATE_WRITE_AS_ONES: /* Set positions outside the field to all ones */ merged_value |= ~mask; break; case AML_FIELD_UPDATE_WRITE_AS_ZEROS: /* Set positions outside the field to all zeros */ merged_value &= mask; break; default: ACPI_ERROR((AE_INFO, "Unknown UpdateRule value: 0x%X", (obj_desc->common_field.field_flags & AML_FIELD_UPDATE_RULE_MASK))); return_ACPI_STATUS(AE_AML_OPERAND_VALUE); } } ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "Mask %8.8X%8.8X, DatumOffset %X, Width %X, " "Value %8.8X%8.8X, MergedValue %8.8X%8.8X\n", ACPI_FORMAT_UINT64(mask), field_datum_byte_offset, obj_desc->common_field.access_byte_width, ACPI_FORMAT_UINT64(field_value), ACPI_FORMAT_UINT64(merged_value))); /* Write the merged value */ status = acpi_ex_field_datum_io(obj_desc, field_datum_byte_offset, &merged_value, ACPI_WRITE); return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_ex_extract_from_field * * PARAMETERS: obj_desc - Field to be read * buffer - Where to store the field data * buffer_length - Length of Buffer * * RETURN: Status * * DESCRIPTION: Retrieve the current value of the given field * ******************************************************************************/ acpi_status acpi_ex_extract_from_field(union acpi_operand_object *obj_desc, void *buffer, u32 buffer_length) { acpi_status status; u64 raw_datum; u64 merged_datum; u32 field_offset = 0; u32 buffer_offset = 0; u32 buffer_tail_bits; u32 datum_count; u32 field_datum_count; u32 access_bit_width; u32 i; ACPI_FUNCTION_TRACE(ex_extract_from_field); /* Validate target buffer and clear it */ if (buffer_length < ACPI_ROUND_BITS_UP_TO_BYTES(obj_desc->common_field.bit_length)) { ACPI_ERROR((AE_INFO, "Field size %u (bits) is too large for buffer (%u)", obj_desc->common_field.bit_length, buffer_length)); return_ACPI_STATUS(AE_BUFFER_OVERFLOW); } memset(buffer, 0, buffer_length); access_bit_width = ACPI_MUL_8(obj_desc->common_field.access_byte_width); /* Handle the simple case here */ if ((obj_desc->common_field.start_field_bit_offset == 0) && (obj_desc->common_field.bit_length == access_bit_width)) { if (buffer_length >= sizeof(u64)) { status = acpi_ex_field_datum_io(obj_desc, 0, buffer, ACPI_READ); } else { /* Use raw_datum (u64) to handle buffers < 64 bits */ status = acpi_ex_field_datum_io(obj_desc, 0, &raw_datum, ACPI_READ); memcpy(buffer, &raw_datum, buffer_length); } return_ACPI_STATUS(status); } /* TBD: Move to common setup code */ /* Field algorithm is limited to sizeof(u64), truncate if needed */ if (obj_desc->common_field.access_byte_width > sizeof(u64)) { obj_desc->common_field.access_byte_width = sizeof(u64); access_bit_width = sizeof(u64) * 8; } /* Compute the number of datums (access width data items) */ datum_count = ACPI_ROUND_UP_TO(obj_desc->common_field.bit_length, access_bit_width); field_datum_count = ACPI_ROUND_UP_TO(obj_desc->common_field.bit_length + obj_desc->common_field. start_field_bit_offset, access_bit_width); /* Priming read from the field */ status = acpi_ex_field_datum_io(obj_desc, field_offset, &raw_datum, ACPI_READ); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } merged_datum = raw_datum >> obj_desc->common_field.start_field_bit_offset; /* Read the rest of the field */ for (i = 1; i < field_datum_count; i++) { /* Get next input datum from the field */ field_offset += obj_desc->common_field.access_byte_width; status = acpi_ex_field_datum_io(obj_desc, field_offset, &raw_datum, ACPI_READ); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } /* * Merge with previous datum if necessary. * * Note: Before the shift, check if the shift value will be larger than * the integer size. If so, there is no need to perform the operation. * This avoids the differences in behavior between different compilers * concerning shift values larger than the target data width. */ if (access_bit_width - obj_desc->common_field.start_field_bit_offset < ACPI_INTEGER_BIT_SIZE) { merged_datum |= raw_datum << (access_bit_width - obj_desc->common_field. start_field_bit_offset); } if (i == datum_count) { break; } /* Write merged datum to target buffer */ memcpy(((char *)buffer) + buffer_offset, &merged_datum, ACPI_MIN(obj_desc->common_field.access_byte_width, buffer_length - buffer_offset)); buffer_offset += obj_desc->common_field.access_byte_width; merged_datum = raw_datum >> obj_desc->common_field.start_field_bit_offset; } /* Mask off any extra bits in the last datum */ buffer_tail_bits = obj_desc->common_field.bit_length % access_bit_width; if (buffer_tail_bits) { merged_datum &= ACPI_MASK_BITS_ABOVE(buffer_tail_bits); } /* Write the last datum to the buffer */ memcpy(((char *)buffer) + buffer_offset, &merged_datum, ACPI_MIN(obj_desc->common_field.access_byte_width, buffer_length - buffer_offset)); return_ACPI_STATUS(AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ex_insert_into_field * * PARAMETERS: obj_desc - Field to be written * buffer - Data to be written * buffer_length - Length of Buffer * * RETURN: Status * * DESCRIPTION: Store the Buffer contents into the given field * ******************************************************************************/ acpi_status acpi_ex_insert_into_field(union acpi_operand_object *obj_desc, void *buffer, u32 buffer_length) { void *new_buffer; acpi_status status; u64 mask; u64 width_mask; u64 merged_datum; u64 raw_datum = 0; u32 field_offset = 0; u32 buffer_offset = 0; u32 buffer_tail_bits; u32 datum_count; u32 field_datum_count; u32 access_bit_width; u32 required_length; u32 i; ACPI_FUNCTION_TRACE(ex_insert_into_field); /* Validate input buffer */ new_buffer = NULL; required_length = ACPI_ROUND_BITS_UP_TO_BYTES(obj_desc->common_field.bit_length); /* * We must have a buffer that is at least as long as the field * we are writing to. This is because individual fields are * indivisible and partial writes are not supported -- as per * the ACPI specification. */ if (buffer_length < required_length) { /* We need to create a new buffer */ new_buffer = ACPI_ALLOCATE_ZEROED(required_length); if (!new_buffer) { return_ACPI_STATUS(AE_NO_MEMORY); } /* * Copy the original data to the new buffer, starting * at Byte zero. All unused (upper) bytes of the * buffer will be 0. */ memcpy((char *)new_buffer, (char *)buffer, buffer_length); buffer = new_buffer; buffer_length = required_length; } /* TBD: Move to common setup code */ /* Algo is limited to sizeof(u64), so cut the access_byte_width */ if (obj_desc->common_field.access_byte_width > sizeof(u64)) { obj_desc->common_field.access_byte_width = sizeof(u64); } access_bit_width = ACPI_MUL_8(obj_desc->common_field.access_byte_width); /* Create the bitmasks used for bit insertion */ width_mask = ACPI_MASK_BITS_ABOVE_64(access_bit_width); mask = width_mask & ACPI_MASK_BITS_BELOW(obj_desc->common_field.start_field_bit_offset); /* Compute the number of datums (access width data items) */ datum_count = ACPI_ROUND_UP_TO(obj_desc->common_field.bit_length, access_bit_width); field_datum_count = ACPI_ROUND_UP_TO(obj_desc->common_field.bit_length + obj_desc->common_field. start_field_bit_offset, access_bit_width); /* Get initial Datum from the input buffer */ memcpy(&raw_datum, buffer, ACPI_MIN(obj_desc->common_field.access_byte_width, buffer_length - buffer_offset)); merged_datum = raw_datum << obj_desc->common_field.start_field_bit_offset; /* Write the entire field */ for (i = 1; i < field_datum_count; i++) { /* Write merged datum to the target field */ merged_datum &= mask; status = acpi_ex_write_with_update_rule(obj_desc, mask, merged_datum, field_offset); if (ACPI_FAILURE(status)) { goto exit; } field_offset += obj_desc->common_field.access_byte_width; /* * Start new output datum by merging with previous input datum * if necessary. * * Note: Before the shift, check if the shift value will be larger than * the integer size. If so, there is no need to perform the operation. * This avoids the differences in behavior between different compilers * concerning shift values larger than the target data width. */ if ((access_bit_width - obj_desc->common_field.start_field_bit_offset) < ACPI_INTEGER_BIT_SIZE) { merged_datum = raw_datum >> (access_bit_width - obj_desc->common_field. start_field_bit_offset); } else { merged_datum = 0; } mask = width_mask; if (i == datum_count) { break; } /* Get the next input datum from the buffer */ buffer_offset += obj_desc->common_field.access_byte_width; memcpy(&raw_datum, ((char *)buffer) + buffer_offset, ACPI_MIN(obj_desc->common_field.access_byte_width, buffer_length - buffer_offset)); merged_datum |= raw_datum << obj_desc->common_field.start_field_bit_offset; } /* Mask off any extra bits in the last datum */ buffer_tail_bits = (obj_desc->common_field.bit_length + obj_desc->common_field.start_field_bit_offset) % access_bit_width; if (buffer_tail_bits) { mask &= ACPI_MASK_BITS_ABOVE(buffer_tail_bits); } /* Write the last datum to the field */ merged_datum &= mask; status = acpi_ex_write_with_update_rule(obj_desc, mask, merged_datum, field_offset); exit: /* Free temporary buffer if we used one */ if (new_buffer) { ACPI_FREE(new_buffer); } return_ACPI_STATUS(status); }
11 9 1 1 11 10 8 9 8 9 8 9 1 789 792 12 12 1 1 1 12 5 5 5 5 5 12 11 11 7 4 4 3 11 2 11 12 13 13 12 12 11 6 5 4 5 8 2 1 1 1 8 4 5 12 14 3 1 3 25 27 26 9 8 7 3 4 1 6 3 1 3 3 3 6 6 1 6 2 1 1 1 7 2 1 1 2 1 1 2 1 1 1 3 2 2 1 3 2 1 2 1 1 17 16 15 14 5 5 3 2 4 1 1 1 1 1 1 1 1 1 3 2 1 1 1 1 5 17 1 2 1 6 5 3 2 1 2 3 2 2 2 2 2 2 2 6 3 1 2 1 1 1 1 1 1 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* raw.c - Raw sockets for protocol family CAN * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */ #include <linux/can/skb.h> #include <linux/can/raw.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN raw protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>"); MODULE_ALIAS("can-proto-1"); #define RAW_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex) #define MASK_ALL 0 /* A raw socket has a list of can_filters attached to it, each receiving * the CAN frames matching that filter. If the filter list is empty, * no CAN frames will be received by the socket. The default after * opening the socket, is to have one filter which receives all frames. * The filter list is allocated dynamically with the exception of the * list containing only one item. This common case is optimized by * storing the single filter in dfilter, to avoid using dynamic memory. */ struct uniqframe { const struct sk_buff *skb; int skbcnt; unsigned int join_rx_count; }; struct raw_sock { struct sock sk; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; int ifindex; unsigned int bound:1; unsigned int loopback:1; unsigned int recv_own_msgs:1; unsigned int fd_frames:1; unsigned int xl_frames:1; unsigned int join_filters:1; struct can_raw_vcid_options raw_vcid_opts; canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; can_err_mask_t err_mask; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ struct uniqframe __percpu *uniq; }; static LIST_HEAD(raw_notifier_list); static DEFINE_SPINLOCK(raw_notifier_lock); static struct raw_sock *raw_busy_notifier; /* Return pointer to store the extra msg flags for raw_recvmsg(). * We use the space of one unsigned int beyond the 'struct sockaddr_can' * in skb->cb. */ static inline unsigned int *raw_flags(struct sk_buff *skb) { sock_skb_cb_check_size(sizeof(struct sockaddr_can) + sizeof(unsigned int)); /* return pointer after struct sockaddr_can */ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); } static inline struct raw_sock *raw_sk(const struct sock *sk) { return (struct raw_sock *)sk; } static void raw_rcv(struct sk_buff *oskb, void *data) { struct sock *sk = (struct sock *)data; struct raw_sock *ro = raw_sk(sk); enum skb_drop_reason reason; struct sockaddr_can *addr; struct sk_buff *skb; unsigned int *pflags; /* check the received tx sock reference */ if (!ro->recv_own_msgs && oskb->sk == sk) return; /* make sure to not pass oversized frames to the socket */ if (!ro->fd_frames && can_is_canfd_skb(oskb)) return; if (can_is_canxl_skb(oskb)) { struct canxl_frame *cxl = (struct canxl_frame *)oskb->data; /* make sure to not pass oversized frames to the socket */ if (!ro->xl_frames) return; /* filter CAN XL VCID content */ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) { /* apply VCID filter if user enabled the filter */ if ((cxl->prio & ro->rx_vcid_mask_shifted) != (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted)) return; } else { /* no filter => do not forward VCID tagged frames */ if (cxl->prio & CANXL_VCID_MASK) return; } } /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { if (!ro->join_filters) return; this_cpu_inc(ro->uniq->join_rx_count); /* drop frame until all enabled filters matched */ if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) return; } else { this_cpu_ptr(ro->uniq)->skb = oskb; this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; this_cpu_ptr(ro->uniq)->join_rx_count = 1; /* drop first frame to check all enabled filters? */ if (ro->join_filters && ro->count > 1) return; } /* clone the given skb to be able to enqueue it into the rcv queue */ skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) return; /* Put the datagram to the queue so that raw_recvmsg() can get * it from there. We need to pass the interface index to * raw_recvmsg(). We pass a whole struct sockaddr_can in * skb->cb containing the interface index. */ sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; /* add CAN specific message flags for raw_recvmsg() */ pflags = raw_flags(skb); *pflags = 0; if (oskb->sk) *pflags |= MSG_DONTROUTE; if (oskb->sk == sk) *pflags |= MSG_CONFIRM; if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) sk_skb_reason_drop(sk, skb, reason); } static int raw_enable_filters(struct net *net, struct net_device *dev, struct sock *sk, struct can_filter *filter, int count) { int err = 0; int i; for (i = 0; i < count; i++) { err = can_rx_register(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); break; } } return err; } static int raw_enable_errfilter(struct net *net, struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { int err = 0; if (err_mask) err = can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk, "raw", sk); return err; } static void raw_disable_filters(struct net *net, struct net_device *dev, struct sock *sk, struct can_filter *filter, int count) { int i; for (i = 0; i < count; i++) can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); } static inline void raw_disable_errfilter(struct net *net, struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { if (err_mask) can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk); } static inline void raw_disable_allfilters(struct net *net, struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); raw_disable_filters(net, dev, sk, ro->filter, ro->count); raw_disable_errfilter(net, dev, sk, ro->err_mask); } static int raw_enable_allfilters(struct net *net, struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); int err; err = raw_enable_filters(net, dev, sk, ro->filter, ro->count); if (!err) { err = raw_enable_errfilter(net, dev, sk, ro->err_mask); if (err) raw_disable_filters(net, dev, sk, ro->filter, ro->count); } return err; } static void raw_notify(struct raw_sock *ro, unsigned long msg, struct net_device *dev) { struct sock *sk = &ro->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (ro->dev != dev) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) { raw_disable_allfilters(dev_net(dev), dev, sk); netdev_put(dev, &ro->dev_tracker); } if (ro->count > 1) kfree(ro->filter); ro->ifindex = 0; ro->bound = 0; ro->dev = NULL; ro->count = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int raw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&raw_notifier_lock); list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) { spin_unlock(&raw_notifier_lock); raw_notify(raw_busy_notifier, msg, dev); spin_lock(&raw_notifier_lock); } raw_busy_notifier = NULL; spin_unlock(&raw_notifier_lock); return NOTIFY_DONE; } static int raw_init(struct sock *sk) { struct raw_sock *ro = raw_sk(sk); ro->bound = 0; ro->ifindex = 0; ro->dev = NULL; /* set default filter to single entry dfilter */ ro->dfilter.can_id = 0; ro->dfilter.can_mask = MASK_ALL; ro->filter = &ro->dfilter; ro->count = 1; /* set default loopback behaviour */ ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; ro->xl_frames = 0; ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ ro->uniq = alloc_percpu(struct uniqframe); if (unlikely(!ro->uniq)) return -ENOMEM; /* set notifier */ spin_lock(&raw_notifier_lock); list_add_tail(&ro->notifier, &raw_notifier_list); spin_unlock(&raw_notifier_lock); return 0; } static int raw_release(struct socket *sock) { struct sock *sk = sock->sk; struct raw_sock *ro; struct net *net; if (!sk) return 0; ro = raw_sk(sk); net = sock_net(sk); spin_lock(&raw_notifier_lock); while (raw_busy_notifier == ro) { spin_unlock(&raw_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&raw_notifier_lock); } list_del(&ro->notifier); spin_unlock(&raw_notifier_lock); rtnl_lock(); lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) { if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); netdev_put(ro->dev, &ro->dev_tracker); } else { raw_disable_allfilters(net, NULL, sk); } } if (ro->count > 1) kfree(ro->filter); ro->ifindex = 0; ro->bound = 0; ro->dev = NULL; ro->count = 0; free_percpu(ro->uniq); sock_orphan(sk); sock->sk = NULL; release_sock(sk); rtnl_unlock(); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; } static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct net_device *dev = NULL; int ifindex; int err = 0; int notify_enetdown = 0; if (len < RAW_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; rtnl_lock(); lock_sock(sk); if (ro->bound && addr->can_ifindex == ro->ifindex) goto out; if (addr->can_ifindex) { dev = dev_get_by_index(sock_net(sk), addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { err = -ENODEV; goto out_put_dev; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), dev, sk); if (err) goto out_put_dev; } else { ifindex = 0; /* filters set by default/setsockopt */ err = raw_enable_allfilters(sock_net(sk), NULL, sk); } if (!err) { if (ro->bound) { /* unregister old filters */ if (ro->dev) { raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); /* drop reference to old ro->dev */ netdev_put(ro->dev, &ro->dev_tracker); } else { raw_disable_allfilters(sock_net(sk), NULL, sk); } } ro->ifindex = ifindex; ro->bound = 1; /* bind() ok -> hold a reference for new ro->dev */ ro->dev = dev; if (ro->dev) netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL); } out_put_dev: /* remove potential reference from dev_get_by_index() */ dev_put(dev); out: release_sock(sk); rtnl_unlock(); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int raw_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, RAW_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = ro->ifindex; return RAW_MIN_NAMELEN; } static int raw_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct can_filter *filter = NULL; /* dyn. alloc'ed filters */ struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; int count = 0; int flag; int err = 0; if (level != SOL_CAN_RAW) return -EINVAL; switch (optname) { case CAN_RAW_FILTER: if (optlen % sizeof(struct can_filter) != 0) return -EINVAL; if (optlen > CAN_RAW_FILTER_MAX * sizeof(struct can_filter)) return -EINVAL; count = optlen / sizeof(struct can_filter); if (count > 1) { /* filter does not fit into dfilter => alloc space */ filter = memdup_sockptr(optval, optlen); if (IS_ERR(filter)) return PTR_ERR(filter); } else if (count == 1) { if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter))) return -EFAULT; } rtnl_lock(); lock_sock(sk); dev = ro->dev; if (ro->bound && dev) { if (dev->reg_state != NETREG_REGISTERED) { if (count > 1) kfree(filter); err = -ENODEV; goto out_fil; } } if (ro->bound) { /* (try to) register the new filters */ if (count == 1) err = raw_enable_filters(sock_net(sk), dev, sk, &sfilter, 1); else err = raw_enable_filters(sock_net(sk), dev, sk, filter, count); if (err) { if (count > 1) kfree(filter); goto out_fil; } /* remove old filter registrations */ raw_disable_filters(sock_net(sk), dev, sk, ro->filter, ro->count); } /* remove old filter space */ if (ro->count > 1) kfree(ro->filter); /* link new filters to the socket */ if (count == 1) { /* copy filter data for single filter */ ro->dfilter = sfilter; filter = &ro->dfilter; } ro->filter = filter; ro->count = count; out_fil: release_sock(sk); rtnl_unlock(); break; case CAN_RAW_ERR_FILTER: if (optlen != sizeof(err_mask)) return -EINVAL; if (copy_from_sockptr(&err_mask, optval, optlen)) return -EFAULT; err_mask &= CAN_ERR_MASK; rtnl_lock(); lock_sock(sk); dev = ro->dev; if (ro->bound && dev) { if (dev->reg_state != NETREG_REGISTERED) { err = -ENODEV; goto out_err; } } /* remove current error mask */ if (ro->bound) { /* (try to) register the new err_mask */ err = raw_enable_errfilter(sock_net(sk), dev, sk, err_mask); if (err) goto out_err; /* remove old err_mask registration */ raw_disable_errfilter(sock_net(sk), dev, sk, ro->err_mask); } /* link new err_mask to the socket */ ro->err_mask = err_mask; out_err: release_sock(sk); rtnl_unlock(); break; case CAN_RAW_LOOPBACK: if (optlen != sizeof(flag)) return -EINVAL; if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; ro->loopback = !!flag; break; case CAN_RAW_RECV_OWN_MSGS: if (optlen != sizeof(flag)) return -EINVAL; if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; ro->recv_own_msgs = !!flag; break; case CAN_RAW_FD_FRAMES: if (optlen != sizeof(flag)) return -EINVAL; if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames && !flag) return -EINVAL; ro->fd_frames = !!flag; break; case CAN_RAW_XL_FRAMES: if (optlen != sizeof(flag)) return -EINVAL; if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; ro->xl_frames = !!flag; /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames) ro->fd_frames = ro->xl_frames; break; case CAN_RAW_XL_VCID_OPTS: if (optlen != sizeof(ro->raw_vcid_opts)) return -EINVAL; if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen)) return -EFAULT; /* prepare 32 bit values for handling in hot path */ ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET; ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET; ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET; break; case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(flag)) return -EINVAL; if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; ro->join_filters = !!flag; break; default: return -ENOPROTOOPT; } return err; } static int raw_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); int flag; int len; void *val; if (level != SOL_CAN_RAW) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_RAW_FILTER: { int err = 0; lock_sock(sk); if (ro->count > 0) { int fsize = ro->count * sizeof(struct can_filter); /* user space buffer to small for filter list? */ if (len < fsize) { /* return -ERANGE and needed space in optlen */ err = -ERANGE; if (put_user(fsize, optlen)) err = -EFAULT; } else { if (len > fsize) len = fsize; if (copy_to_user(optval, ro->filter, len)) err = -EFAULT; } } else { len = 0; } release_sock(sk); if (!err) err = put_user(len, optlen); return err; } case CAN_RAW_ERR_FILTER: if (len > sizeof(can_err_mask_t)) len = sizeof(can_err_mask_t); val = &ro->err_mask; break; case CAN_RAW_LOOPBACK: if (len > sizeof(int)) len = sizeof(int); flag = ro->loopback; val = &flag; break; case CAN_RAW_RECV_OWN_MSGS: if (len > sizeof(int)) len = sizeof(int); flag = ro->recv_own_msgs; val = &flag; break; case CAN_RAW_FD_FRAMES: if (len > sizeof(int)) len = sizeof(int); flag = ro->fd_frames; val = &flag; break; case CAN_RAW_XL_FRAMES: if (len > sizeof(int)) len = sizeof(int); flag = ro->xl_frames; val = &flag; break; case CAN_RAW_XL_VCID_OPTS: { int err = 0; /* user space buffer to small for VCID opts? */ if (len < sizeof(ro->raw_vcid_opts)) { /* return -ERANGE and needed space in optlen */ err = -ERANGE; if (put_user(sizeof(ro->raw_vcid_opts), optlen)) err = -EFAULT; } else { if (len > sizeof(ro->raw_vcid_opts)) len = sizeof(ro->raw_vcid_opts); if (copy_to_user(optval, &ro->raw_vcid_opts, len)) err = -EFAULT; } if (!err) err = put_user(len, optlen); return err; } case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); flag = ro->join_filters; val = &flag; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) { struct canxl_frame *cxl = (struct canxl_frame *)skb->data; /* sanitize non CAN XL bits */ cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK); /* clear VCID in CAN XL frame if pass through is disabled */ if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS)) cxl->prio &= CANXL_PRIO_MASK; /* set VCID in CAN XL frame if enabled */ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) { cxl->prio &= CANXL_PRIO_MASK; cxl->prio |= ro->tx_vcid_shifted; } } static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { /* Classical CAN -> no checks for flags and device capabilities */ if (can_is_can_skb(skb)) return CAN_MTU; /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) return CANFD_MTU; /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && can_is_canxl_dev_mtu(mtu)) return CANXL_MTU; return 0; } static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; unsigned int txmtu; int ifindex; int err = -EINVAL; /* check for valid CAN frame sizes */ if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU) return -EINVAL; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < RAW_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; ifindex = addr->can_ifindex; } else { ifindex = ro->ifindex; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENXIO; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto put_dev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto free_skb; err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu)); if (!txmtu) goto free_skb; /* only CANXL: clear/forward/set VCID value */ if (txmtu == CANXL_MTU) raw_put_canxl_vcid(ro, skb); sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto free_skb; } skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); err = can_send(skb, ro->loopback); dev_put(dev); if (err) goto send_failed; return size; free_skb: kfree_skb(skb); put_dev: dev_put(dev); send_failed: return err; } static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = 0; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sk, msg, size, SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &err); if (!skb) return err; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(RAW_MIN_NAMELEN); msg->msg_namelen = RAW_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* assign the flags that have been recorded in raw_rcv() */ msg->msg_flags |= *(raw_flags(skb)); skb_free_datagram(sk, skb); return size; } static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops raw_ops = { .family = PF_CAN, .release = raw_release, .bind = raw_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = raw_getname, .poll = datagram_poll, .ioctl = raw_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = raw_setsockopt, .getsockopt = raw_getsockopt, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .mmap = sock_no_mmap, }; static struct proto raw_proto __read_mostly = { .name = "CAN_RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct raw_sock), .init = raw_init, }; static const struct can_proto raw_can_proto = { .type = SOCK_RAW, .protocol = CAN_RAW, .ops = &raw_ops, .prot = &raw_proto, }; static struct notifier_block canraw_notifier = { .notifier_call = raw_notifier }; static __init int raw_module_init(void) { int err; pr_info("can: raw protocol\n"); err = register_netdevice_notifier(&canraw_notifier); if (err) return err; err = can_proto_register(&raw_can_proto); if (err < 0) { pr_err("can: registration of raw protocol failed\n"); goto register_proto_failed; } return 0; register_proto_failed: unregister_netdevice_notifier(&canraw_notifier); return err; } static __exit void raw_module_exit(void) { can_proto_unregister(&raw_can_proto); unregister_netdevice_notifier(&canraw_notifier); } module_init(raw_module_init); module_exit(raw_module_exit);
12 18 15 15 20 12 2327 319 58 5126 144 4076 20 43 349 525 520 5 520 137 526 11847 11839 14227 14274 14231 11892 11841 28 28 14274 28 28 7673 7663 11885 11894 11883 11886 6 12036 11888 33 33 33 33 33 33 33 33 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> #include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER #define MAX_PAGE_ORDER 10 #else #define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif #define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* Defines the order for the number of pages that have a migrate type. */ #ifndef CONFIG_PAGE_BLOCK_MAX_ORDER #define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER #else #define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER #endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */ /* * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_MAX_ORDER, * which defines the order for the number of pages that can have a migrate type */ #if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER) #error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER #endif /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. */ MIGRATE_CMA, __MIGRATE_TYPE_END = MIGRATE_CMA, #else __MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) /* * __dump_folio() in mm/debug.c passes a folio pointer to on-stack struct folio, * so folio_pfn() cannot be used and pfn is needed. */ # define is_migrate_cma_folio(folio, pfn) \ (get_pfnblock_migratetype(&folio->page, pfn) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false # define is_migrate_cma_folio(folio, pfn) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } /* * Check whether a migratetype can be merged with another migratetype. * * It is only mergeable when it can fall back to other migratetypes for * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. */ static inline bool migratetype_is_mergeable(int mt) { return mt < MIGRATE_PCPTYPES; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define get_pageblock_migratetype(page) \ get_pfnblock_migratetype(page, page_to_pfn(page)) #define folio_migratetype(folio) \ get_pageblock_migratetype(&folio->page) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; struct pglist_data; #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_EVENT_ITEMS }; #else #define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_FREE_PAGES_BLOCKS, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, #ifdef CONFIG_UNACCEPTED_MEMORY NR_UNACCEPTED, #endif NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_THROTTLED_WRITTEN, /* NR_WRITTEN while reclaim throttled */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ /** * Candidate pages for promotion based on hint fault latency. This * counter is used to control the promotion rate and adjust the hot * threshold. */ PGPROMOTE_CANDIDATE, /** * Not rate-limited (NRL) candidate pages for those can be promoted * without considering hot threshold because of enough free pages in * fast-tier node. These promotions bypass the regular hotness checks * and do NOT influence the promotion rate-limiter or * threshold-adjustment logic. * This is for statistics/monitoring purposes. */ PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, PGDEMOTE_PROACTIVE, #ifdef CONFIG_HUGETLB_PAGE NR_HUGETLB, #endif NR_BALLOON_PAGES, NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the item should be printed in THPs (/proc/vmstat * currently prints number of anon, file and shmem THPs. But the item * is charged in pages). */ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || item == NR_SHMEM_PMDMAPPED || item == NR_FILE_PMDMAPPED; } /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; enum vmscan_throttle_state { VMSCAN_THROTTLE_WRITEBACK, VMSCAN_THROTTLE_ISOLATED, VMSCAN_THROTTLE_NOPROGRESS, VMSCAN_THROTTLE_CONGESTED, NR_VMSCAN_THROTTLE, }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define WORKINGSET_ANON 0 #define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { /* * An lruvec has many dirty pages backed by a congested BDI: * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim. * It can be cleared by cgroup reclaim or kswapd. * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim. * It can only be cleared by kswapd. * * Essentially, kswapd can unthrottle an lruvec throttled by cgroup * reclaim, but not vice versa. This only applies to the root cgroup. * The goal is to prevent cgroup reclaim on the root cgroup (e.g. * memory.reclaim) to unthrottle an unbalanced node (that was throttled * by kswapd). */ LRUVEC_CGROUP_CONGESTED, LRUVEC_NODE_CONGESTED, }; #endif /* !__GENERATING_BOUNDS_H */ /* * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * * After a folio is faulted in, the aging needs to check the accessed bit at * least twice before handing this folio over to the eviction. The first check * clears the accessed bit from the initial fault; the second check makes sure * this folio hasn't been used since then. This process, AKA second chance, * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two * generations are considered active; the rest of generations, if they exist, * are considered inactive. See lru_gen_is_active(). * * PG_active is always cleared while a folio is on one of lrugen->folios[] so * that the sliding window needs not to worry about it. And it's set again when * a folio considered active is isolated for non-reclaiming purposes, e.g., * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* * Each generation is divided into multiple tiers. A folio accessed N times * through file descriptors is in tier order_base_2(N). A folio in the first * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by * PG_workingset. A folio in any other tier (1<N<5) between the first and last * is marked by additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, * comparisons of refaulted/(evicted+protected) from the first tier and the rest * infer whether folios accessed multiple times through file descriptors are * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) /* * For folios accessed multiple times through file descriptors, * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily * promoted into the second oldest generation in the eviction path. And when * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is * only valid when PG_referenced is set. * * For folios accessed multiple times through page tables, folio_update_gen() * from a page table walk or lru_gen_set_refs() from a rmap walk sets * PG_referenced after the accessed bit is cleared for the first time. * Thereafter, those two paths set PG_workingset and promote folios to the * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. * * For both cases above, after PG_workingset is set on a folio, it remains until * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It * can be set again if lru_gen_test_recent() returns true upon a refault. */ #define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) struct lruvec; struct page_vma_mapped_walk; #ifdef CONFIG_LRU_GEN enum { LRU_GEN_ANON, LRU_GEN_FILE, }; enum { LRU_GEN_CORE, LRU_GEN_MM_WALK, LRU_GEN_NONLEAF_YOUNG, NR_LRU_GEN_CAPS }; #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) /* whether to keep historical stats from evicted generations */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS MAX_NR_GENS #else #define NR_HIST_GENS 1U #endif /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are * stored in min_seq[] separately for anon and file types so that they can be * incremented independently. Ideally min_seq[] are kept in sync when both anon * and file types are evictable. However, to adapt to situations like extreme * swappiness, they are allowed to be out of sync by at most * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ unsigned long min_seq[ANON_AND_FILE]; /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; /* can only be modified under the LRU lock */ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; }; enum { MM_LEAF_TOTAL, /* total leaf entries */ MM_LEAF_YOUNG, /* young leaf entries */ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */ NR_MM_STATS }; /* double-buffering Bloom filters */ #define NR_BLOOM_FILTERS 2 struct lru_gen_mm_state { /* synced with max_seq after each iteration */ unsigned long seq; /* where the current iteration continues after */ struct list_head *head; /* where the last iteration ended before */ struct list_head *tail; /* Bloom filters flip after each iteration */ unsigned long *filters[NR_BLOOM_FILTERS]; /* the mm stats for debugging */ unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; }; struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; /* max_seq from lru_gen_folio: can be out of date */ unsigned long seq; /* the next address within an mm to scan */ unsigned long next_addr; /* to batch promoted pages */ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* to batch the mm stats */ int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; int swappiness; bool force_scan; }; /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins * to improve scalability. For each bin, the hlist_nulls is virtually divided * into three segments: the head, the tail and the default. * * An onlining memcg is added to the tail of a random bin in the old generation. * The eviction starts at the head of a random bin in the old generation. The * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes * the old generation, is incremented when all its bins become empty. * * There are four operations: * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; * 3. The first attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_TAIL; * 4. The second attempt to reclaim a memcg offlined or below reclaimable size * threshold, which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * * Notes: * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing * of their max_seq counters ensures the eventual fairness to all eligible * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). * 2. There are only two valid generations: old (seq) and young (seq+1). * MEMCG_NR_GENS is set to three so that when reading the generation counter * locklessly, a stale value (seq-1) does not wraparound to young. */ #define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { /* the per-node memcg generation counter */ unsigned long seq; /* each memcg has one lru_gen_folio per node */ unsigned long nr_memcgs[MEMCG_NR_GENS]; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; /* protects the above */ spinlock_t lock; }; void lru_gen_init_pgdat(struct pglist_data *pgdat); void lru_gen_init_lruvec(struct lruvec *lruvec); bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) { } static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { return false; } static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } #endif /* CONFIG_LRU_GEN */ struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* per lruvec lru_lock for memcg */ spinlock_t lru_lock; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; #ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_PROMO, NR_WMARK }; /* * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Flags used in pcp->flags field. * * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. * * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before * draining PCP for consecutive high-order pages freeing without * allocation if data cache slice of CPU is large enough. To reduce * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) #define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int high_min; /* min high watermark */ int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; } ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; s8 stat_threshold; #endif #ifdef CONFIG_NUMA /* * Low priority inaccurate counters that are only folded * on demand. Use a large type to avoid the overhead of * folding during refresh_cpu_vm_stats. */ unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Therefore, we do not allow * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and * faulted, they come from the right zone right away. However, it is * still possible that address space already has pages in * ZONE_MOVABLE at the time when pages are pinned (i.e. user has * touches that memory before pinning). In such case we migrate them * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. * 7. Memory-hotplug: when using memmap_on_memory and onlining the * memory to the MOVABLE zone, the vmemmap pages are also placed in * such zone. Such pages cannot be really moved around as they are * self-stored in the range, but they are treated as movable when * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; unsigned long nr_free_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pages __percpu *per_cpu_pageset; struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access */ int pageset_high_min; int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * present_early_pages is present pages existing within the zone * located on memory available since early boot, excluding hotplugged * memory. * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * cma pages is present pages that are assigned for CMA use * (MIGRATE_CMA). * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/done(). Any reader who can't tolerant drift of * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; #if defined(CONFIG_MEMORY_HOTPLUG) unsigned long present_early_pages; #endif #ifdef CONFIG_CMA unsigned long cma_pages; #endif const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; /* To be called once the last page in the zone is accepted */ struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Pages to be freed when next trylock succeeds */ struct llist_head trylock_free_pages; /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; static inline unsigned long wmark_pages(const struct zone *z, enum zone_watermarks w) { return z->_watermark[w] + z->watermark_boost; } static inline unsigned long min_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_MIN); } static inline unsigned long low_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_LOW); } static inline unsigned long high_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_HIGH); } static inline unsigned long promo_wmark_pages(const struct zone *z) { return wmark_pages(z, WMARK_PROMO); } static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_cma_pages(struct zone *zone) { #ifdef CONFIG_CMA return zone->cma_pages; #else return 0; #endif } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } #ifndef BUILD_VDSO32_64 /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) #define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH) #define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) { ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type page_zonenum(const struct page *page) { return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } /* * Consecutive zone device pages should not be merged into the same sgl * or bvec segment with other types of pages or if they belong to different * pgmaps. Otherwise getting the pgmap of a given segment is not possible * without scanning the entire segment. This helper returns true either if * both pages are not zone device pages or both pages are zone device pages * with the same pgmap. */ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { return true; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { return NULL; } #endif static inline bool is_zone_device_page(const struct page *page) { return memdesc_is_zone_device(page->flags); } static inline bool folio_is_zone_device(const struct folio *folio) { return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) { return page_zonenum(page) == ZONE_MOVABLE; } static inline bool folio_is_zone_movable(const struct folio *folio) { return folio_zonenum(folio) == ZONE_MOVABLE; } #endif /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; /* * The array of struct pages for flatmem. * It must be declared for SPARSEMEM as well because there are configurations * that rely on that. */ extern struct page *mem_map; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif #ifdef CONFIG_MEMORY_FAILURE /* * Per NUMA node memory failure handling statistics. */ struct memory_failure_stats { /* * Number of raw pages poisoned. * Cases not accounted: memory outside kernel control, offline page, * arch-specific memory_failure (SGX), hwpoison_filter() filtered * error events, and unpoison actions from hwpoison_unpoison. */ unsigned long total; /* * Recovery results of poisoned raw pages handled by memory_failure, * in sync with mf_result. * total = ignored + failed + delayed + recovered. * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. */ unsigned long ignored; unsigned long failed; unsigned long delayed; unsigned long recovered; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; /* workqueues for throttling reclaim for different reasons. */ wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ #ifdef CONFIG_MEMORY_HOTPLUG struct mutex kswapd_lock; #endif struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_NUMA_BALANCING /* start time in ms of current promote rate limit period */ unsigned int nbp_rl_start; /* number of promote candidate pages at start time of current rate limit period */ unsigned long nbp_rl_nr_cand; /* promote threshold in ms */ unsigned int nbp_threshold; /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; /* lru_gen_folio list */ struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif #ifdef CONFIG_MEMORY_FAILURE struct memory_failure_stats mf_stats; #endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else static inline bool zone_is_zone_device(const struct zone *zone) { return false; } #endif /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). */ static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA static inline int zone_to_nid(const struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(const struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } #ifdef CONFIG_ZONE_DMA bool has_managed_dma(void); #else static inline bool has_managed_dma(void) { return false; } #endif #ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } #else /* CONFIG_NUMA */ #include <asm/mmzone.h> #endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat: pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone: pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. */ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z: The cursor used as a starting point for the search * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. * * Return: the next zone at or below highest_zoneidx within the allowed * nodemask using a cursor within a zonelist as a starting point */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist: The zonelist to search for a suitable zone * @highest_zoneidx: The zone index of the highest zone to return * @nodes: An optional nodemask to filter the zonelist with * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. * * Return: Zoneref pointer for the first suitable zone found */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone: The current zone in the iterator * @z: The current pointer within zonelist->_zonerefs being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * @nodemask: Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone: The current zone in the iterator * @z: The current pointer within zonelist->zones being iterated * @zlist: The zonelist being iterated * @highidx: The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) /* Whether the 'nodes' are all movable nodes */ static inline bool movable_only_nodes(nodemask_t *nodes) { struct zonelist *zonelist; struct zoneref *z; int nid; if (nodes_empty(*nodes)) return false; /* * We can chose arbitrary node from the nodemask to get a * zonelist as they are interlinked. We just need to find * at least one zone that can satisfy kernel allocations. */ nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); return (!zonelist_zone(z)) ? true : false; } #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { unsigned long root = SECTION_NR_TO_ROOT(nr); if (unlikely(root >= NR_SECTION_ROOTS)) return NULL; #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section || !mem_section[root]) return NULL; #endif return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available on all architectures. * However, we can exceed 6 bits on some other architectures except * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available * with the worst case of 64K pages on arm64) if we make sure the * exceeded bit is not applicable to powerpc. */ enum { SECTION_MARKED_PRESENT_BIT, SECTION_HAS_MEM_MAP_BIT, SECTION_IS_ONLINE_BIT, SECTION_IS_EARLY_BIT, #ifdef CONFIG_ZONE_DEVICE SECTION_TAINT_ZONE_DEVICE_BIT, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT SECTION_IS_VMEMMAP_PREINIT_BIT, #endif SECTION_MAP_LAST_BIT, }; #define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) #define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) #define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) #define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) #ifdef CONFIG_ZONE_DEVICE #define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT #define SECTION_IS_VMEMMAP_PREINIT BIT(SECTION_IS_VMEMMAP_PREINIT_BIT) #endif #define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) #define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else static inline int online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); } void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } static inline void sparse_vmemmap_init_nid_early(int nid) { } static inline void sparse_vmemmap_init_nid_late(int nid) { } #endif static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); struct mem_section_usage *usage = READ_ONCE(ms->usage); return usage ? test_bit(idx, usage->subsection_map) : 0; } static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) { struct mem_section_usage *usage = READ_ONCE(ms->usage); int idx = subsection_map_index(*pfn); unsigned long bit; if (!usage) return false; if (test_bit(idx, usage->subsection_map)) return true; /* Find the next subsection that exists */ bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx); if (bit == SUBSECTIONS_PER_SECTION) return false; *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION); return true; } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn) { return true; } #endif void sparse_init_early_section(int nid, struct page *map, unsigned long pnum, unsigned long flags); #ifndef CONFIG_HAVE_ARCH_PFN_VALID /** * pfn_valid - check if there is a valid memory map entry for a PFN * @pfn: the page frame number to check * * Check if there is a valid memory map entry aka struct page for the @pfn. * Note, that availability of the memory map entry does not imply that * there is actual usable memory at that @pfn. The struct page may * represent a hole or an unusable page frame. * * Return: 1 for PFNs that have memory map entries and 0 otherwise */ static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the * pfn. Else it might lead to false positives when * some of the upper bits are set, but the lower bits * match a valid pfn. */ if (PHYS_PFN(PFN_PHYS(pfn)) != pfn) return 0; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); rcu_read_lock_sched(); if (!valid_section(ms)) { rcu_read_unlock_sched(); return 0; } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ ret = early_section(ms) || pfn_section_valid(ms, pfn); rcu_read_unlock_sched(); return ret; } /* Returns end_pfn or higher if no valid PFN remaining in range */ static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn) { unsigned long nr = pfn_to_section_nr(pfn); rcu_read_lock_sched(); while (nr <= __highest_present_section_nr && pfn < end_pfn) { struct mem_section *ms = __pfn_to_section(pfn); if (valid_section(ms) && (early_section(ms) || pfn_section_first_valid(ms, &pfn))) { rcu_read_unlock_sched(); return pfn; } /* Nothing left in this section? Skip to next section */ nr++; pfn = section_nr_to_pfn(nr); } rcu_read_unlock_sched(); return end_pfn; } static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn) { pfn++; if (pfn >= end_pfn) return end_pfn; /* * Either every PFN within the section (or subsection for VMEMMAP) is * valid, or none of them are. So there's no point repeating the check * for every PFN; only call first_valid_pfn() again when crossing a * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)). */ if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ? PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK)) return pfn; return first_valid_pfn(pfn, end_pfn); } #define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \ (_pfn) < (_end_pfn); \ (_pfn) = next_valid_pfn((_pfn), (_end_pfn))) #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start - 1); \ section_nr != -1; \ section_nr = next_present_section_nr(section_nr)) /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. */ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* * Fallback case for when the architecture provides its own pfn_valid() but * not a corresponding for_each_valid_pfn(). */ #ifndef for_each_valid_pfn #define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \ for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \ if (pfn_valid(_pfn)) #endif #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 // SPDX-License-Identifier: GPL-2.0 /* * net/tipc/crypto.c: TIPC crypto for key handling & packet en/decryption * * Copyright (c) 2019, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <crypto/aead.h> #include <crypto/aes.h> #include <crypto/rng.h> #include "crypto.h" #include "msg.h" #include "bcast.h" #define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */ #define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */ #define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ #define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */ #define TIPC_MAX_TFMS_DEF 10 #define TIPC_MAX_TFMS_LIM 1000 #define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ /* * TIPC Key ids */ enum { KEY_MASTER = 0, KEY_MIN = KEY_MASTER, KEY_1 = 1, KEY_2, KEY_3, KEY_MAX = KEY_3, }; /* * TIPC Crypto statistics */ enum { STAT_OK, STAT_NOK, STAT_ASYNC, STAT_ASYNC_OK, STAT_ASYNC_NOK, STAT_BADKEYS, /* tx only */ STAT_BADMSGS = STAT_BADKEYS, /* rx only */ STAT_NOKEYS, STAT_SWITCHES, MAX_STATS, }; /* TIPC crypto statistics' header */ static const char *hstats[MAX_STATS] = {"ok", "nok", "async", "async_ok", "async_nok", "badmsgs", "nokeys", "switches"}; /* Max TFMs number per key */ int sysctl_tipc_max_tfms __read_mostly = TIPC_MAX_TFMS_DEF; /* Key exchange switch, default: on */ int sysctl_tipc_key_exchange_enabled __read_mostly = 1; /* * struct tipc_key - TIPC keys' status indicator * * 7 6 5 4 3 2 1 0 * +-----+-----+-----+-----+-----+-----+-----+-----+ * key: | (reserved)|passive idx| active idx|pending idx| * +-----+-----+-----+-----+-----+-----+-----+-----+ */ struct tipc_key { #define KEY_BITS (2) #define KEY_MASK ((1 << KEY_BITS) - 1) union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 pending:2, active:2, passive:2, /* rx only */ reserved:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:2, passive:2, /* rx only */ active:2, pending:2; #else #error "Please fix <asm/byteorder.h>" #endif } __packed; u8 keys; }; }; /** * struct tipc_tfm - TIPC TFM structure to form a list of TFMs * @tfm: cipher handle/key * @list: linked list of TFMs */ struct tipc_tfm { struct crypto_aead *tfm; struct list_head list; }; /** * struct tipc_aead - TIPC AEAD key structure * @tfm_entry: per-cpu pointer to one entry in TFM list * @crypto: TIPC crypto owns this key * @cloned: reference to the source key in case cloning * @users: the number of the key users (TX/RX) * @salt: the key's SALT value * @authsize: authentication tag size (max = 16) * @mode: crypto mode is applied to the key * @hint: a hint for user key * @rcu: struct rcu_head * @key: the aead key * @gen: the key's generation * @seqno: the key seqno (cluster scope) * @refcnt: the key reference counter */ struct tipc_aead { #define TIPC_AEAD_HINT_LEN (5) struct tipc_tfm * __percpu *tfm_entry; struct tipc_crypto *crypto; struct tipc_aead *cloned; atomic_t users; u32 salt; u8 authsize; u8 mode; char hint[2 * TIPC_AEAD_HINT_LEN + 1]; struct rcu_head rcu; struct tipc_aead_key *key; u16 gen; atomic64_t seqno ____cacheline_aligned; refcount_t refcnt ____cacheline_aligned; } ____cacheline_aligned; /** * struct tipc_crypto_stats - TIPC Crypto statistics * @stat: array of crypto statistics */ struct tipc_crypto_stats { unsigned int stat[MAX_STATS]; }; /** * struct tipc_crypto - TIPC TX/RX crypto structure * @net: struct net * @node: TIPC node (RX) * @aead: array of pointers to AEAD keys for encryption/decryption * @peer_rx_active: replicated peer RX active key index * @key_gen: TX/RX key generation * @key: the key states * @skey_mode: session key's mode * @skey: received session key * @wq: common workqueue on TX crypto * @work: delayed work sched for TX/RX * @key_distr: key distributing state * @rekeying_intv: rekeying interval (in minutes) * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) * @timer1: general timer 1 (jiffies) * @timer2: general timer 2 (jiffies) * @working: the crypto is working or not * @key_master: flag indicates if master key exists * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) * @nokey: no key indication * @flags: combined flags field * @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; u16 key_gen; struct tipc_key key; u8 skey_mode; struct tipc_aead_key *skey; struct workqueue_struct *wq; struct delayed_work work; #define KEY_DISTR_SCHED 1 #define KEY_DISTR_COMPL 2 atomic_t key_distr; u32 rekeying_intv; struct tipc_crypto_stats __percpu *stats; char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; union { struct { u8 working:1; u8 key_master:1; u8 legacy_user:1; u8 nokey: 1; }; u8 flags; }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; /* struct tipc_crypto_tx_ctx - TX context for callbacks */ struct tipc_crypto_tx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; struct tipc_media_addr dst; }; /* struct tipc_crypto_rx_ctx - RX context for callbacks */ struct tipc_crypto_rx_ctx { struct tipc_aead *aead; struct tipc_bearer *bearer; }; static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead); static inline void tipc_aead_put(struct tipc_aead *aead); static void tipc_aead_free(struct rcu_head *rp); static int tipc_aead_users(struct tipc_aead __rcu *aead); static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim); static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val); static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead); static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode); static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src); static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg); static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode); static void tipc_aead_encrypt_done(void *data, int err); static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b); static void tipc_aead_decrypt_done(void *data, int err); static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr); static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx); static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key); static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); static void tipc_crypto_do_cmd(struct net *net, int cmd); static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf); static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf); static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode); static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); static void tipc_crypto_work_tx(struct work_struct *work); static void tipc_crypto_work_rx(struct work_struct *work); static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define is_tx(crypto) (!(crypto)->node) #define is_rx(crypto) (!is_tx(crypto)) #define key_next(cur) ((cur) % KEY_MAX + 1) #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \ lockdep_is_held(lock)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ tipc_aead_put(__tmp); \ } while (0) #define tipc_crypto_key_detach(rcu_ptr, lock) \ tipc_aead_rcu_replace((rcu_ptr), NULL, lock) /** * tipc_aead_key_validate - Validate a AEAD user key * @ukey: pointer to user key data * @info: netlink info pointer */ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) { int keylen; /* Check if algorithm exists */ if (unlikely(!crypto_has_alg(ukey->alg_name, 0, 0))) { GENL_SET_ERR_MSG(info, "unable to load the algorithm (module existed?)"); return -ENODEV; } /* Currently, we only support the "gcm(aes)" cipher algorithm */ if (strcmp(ukey->alg_name, "gcm(aes)")) { GENL_SET_ERR_MSG(info, "not supported yet the algorithm"); return -ENOTSUPP; } /* Check if key size is correct */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; if (unlikely(keylen != TIPC_AES_GCM_KEY_SIZE_128 && keylen != TIPC_AES_GCM_KEY_SIZE_192 && keylen != TIPC_AES_GCM_KEY_SIZE_256)) { GENL_SET_ERR_MSG(info, "incorrect key length (20, 28 or 36 octets?)"); return -EKEYREJECTED; } return 0; } /** * tipc_aead_key_generate - Generate new session key * @skey: input/output key with new content * * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_key_generate(struct tipc_aead_key *skey) { int rc = 0; /* Fill the key's content with a random value via RNG cipher */ rc = crypto_get_default_rng(); if (likely(!rc)) { rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, skey->keylen); crypto_put_default_rng(); } return rc; } static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (unlikely(!tmp || !refcount_inc_not_zero(&tmp->refcnt))) tmp = NULL; rcu_read_unlock(); return tmp; } static inline void tipc_aead_put(struct tipc_aead *aead) { if (aead && refcount_dec_and_test(&aead->refcnt)) call_rcu(&aead->rcu, tipc_aead_free); } /** * tipc_aead_free - Release AEAD key incl. all the TFMs in the list * @rp: rcu head pointer */ static void tipc_aead_free(struct rcu_head *rp) { struct tipc_aead *aead = container_of(rp, struct tipc_aead, rcu); struct tipc_tfm *tfm_entry, *head, *tmp; if (aead->cloned) { tipc_aead_put(aead->cloned); } else { head = *get_cpu_ptr(aead->tfm_entry); put_cpu_ptr(aead->tfm_entry); list_for_each_entry_safe(tfm_entry, tmp, &head->list, list) { crypto_free_aead(tfm_entry->tfm); list_del(&tfm_entry->list); kfree(tfm_entry); } /* Free the head */ crypto_free_aead(head->tfm); list_del(&head->list); kfree(head); } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); kfree_sensitive(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; int users = 0; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) users = atomic_read(&tmp->users); rcu_read_unlock(); return users; } static void tipc_aead_users_inc(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&tmp->users, 1, lim); rcu_read_unlock(); } static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) { struct tipc_aead *tmp; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); rcu_read_unlock(); } static void tipc_aead_users_set(struct tipc_aead __rcu *aead, int val) { struct tipc_aead *tmp; int cur; rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) { do { cur = atomic_read(&tmp->users); if (cur == val) break; } while (atomic_cmpxchg(&tmp->users, cur, val) != cur); } rcu_read_unlock(); } /** * tipc_aead_tfm_next - Move TFM entry to the next one in list and return it * @aead: the AEAD key pointer */ static struct crypto_aead *tipc_aead_tfm_next(struct tipc_aead *aead) { struct tipc_tfm **tfm_entry; struct crypto_aead *tfm; tfm_entry = get_cpu_ptr(aead->tfm_entry); *tfm_entry = list_next_entry(*tfm_entry, list); tfm = (*tfm_entry)->tfm; put_cpu_ptr(tfm_entry); return tfm; } /** * tipc_aead_init - Initiate TIPC AEAD * @aead: returned new TIPC AEAD key handle pointer * @ukey: pointer to user key data * @mode: the key mode * * Allocate a (list of) new cipher transformation (TFM) with the specific user * key data if valid. The number of the allocated TFMs can be set via the sysfs * "net/tipc/max_tfms" first. * Also, all the other AEAD data are also initialized. * * Return: 0 if the initiation is successful, otherwise: < 0 */ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, u8 mode) { struct tipc_tfm *tfm_entry, *head; struct crypto_aead *tfm; struct tipc_aead *tmp; int keylen, err, cpu; int tfm_cnt = 0; if (unlikely(*aead)) return -EEXIST; /* Allocate a new AEAD */ tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (unlikely(!tmp)) return -ENOMEM; /* The key consists of two parts: [AES-KEY][SALT] */ keylen = ukey->keylen - TIPC_AES_GCM_SALT_SIZE; /* Allocate per-cpu TFM entry pointer */ tmp->tfm_entry = alloc_percpu(struct tipc_tfm *); if (!tmp->tfm_entry) { kfree_sensitive(tmp); return -ENOMEM; } /* Make a list of TFMs with the user key data */ do { tfm = crypto_alloc_aead(ukey->alg_name, 0, 0); if (IS_ERR(tfm)) { err = PTR_ERR(tfm); break; } if (unlikely(!tfm_cnt && crypto_aead_ivsize(tfm) != TIPC_AES_GCM_IV_SIZE)) { crypto_free_aead(tfm); err = -ENOTSUPP; break; } err = crypto_aead_setauthsize(tfm, TIPC_AES_GCM_TAG_SIZE); err |= crypto_aead_setkey(tfm, ukey->key, keylen); if (unlikely(err)) { crypto_free_aead(tfm); break; } tfm_entry = kmalloc(sizeof(*tfm_entry), GFP_KERNEL); if (unlikely(!tfm_entry)) { crypto_free_aead(tfm); err = -ENOMEM; break; } INIT_LIST_HEAD(&tfm_entry->list); tfm_entry->tfm = tfm; /* First entry? */ if (!tfm_cnt) { head = tfm_entry; for_each_possible_cpu(cpu) { *per_cpu_ptr(tmp->tfm_entry, cpu) = head; } } else { list_add_tail(&tfm_entry->list, &head->list); } } while (++tfm_cnt < sysctl_tipc_max_tfms); /* Not any TFM is allocated? */ if (!tfm_cnt) { free_percpu(tmp->tfm_entry); kfree_sensitive(tmp); return err; } /* Form a hex string of some last bytes as the key's hint */ bin2hex(tmp->hint, ukey->key + keylen - TIPC_AEAD_HINT_LEN, TIPC_AEAD_HINT_LEN); /* Initialize the other data */ tmp->mode = mode; tmp->cloned = NULL; tmp->authsize = TIPC_AES_GCM_TAG_SIZE; tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); if (!tmp->key) { tipc_aead_free(&tmp->rcu); return -ENOMEM; } memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); atomic_set(&tmp->users, 0); atomic64_set(&tmp->seqno, 0); refcount_set(&tmp->refcnt, 1); *aead = tmp; return 0; } /** * tipc_aead_clone - Clone a TIPC AEAD key * @dst: dest key for the cloning * @src: source key to clone from * * Make a "copy" of the source AEAD key data to the dest, the TFMs list is * common for the keys. * A reference to the source is hold in the "cloned" pointer for the later * freeing purposes. * * Note: this must be done in cluster-key mode only! * Return: 0 in case of success, otherwise < 0 */ static int tipc_aead_clone(struct tipc_aead **dst, struct tipc_aead *src) { struct tipc_aead *aead; int cpu; if (!src) return -ENOKEY; if (src->mode != CLUSTER_KEY) return -EINVAL; if (unlikely(*dst)) return -EEXIST; aead = kzalloc(sizeof(*aead), GFP_ATOMIC); if (unlikely(!aead)) return -ENOMEM; aead->tfm_entry = alloc_percpu_gfp(struct tipc_tfm *, GFP_ATOMIC); if (unlikely(!aead->tfm_entry)) { kfree_sensitive(aead); return -ENOMEM; } for_each_possible_cpu(cpu) { *per_cpu_ptr(aead->tfm_entry, cpu) = *per_cpu_ptr(src->tfm_entry, cpu); } memcpy(aead->hint, src->hint, sizeof(src->hint)); aead->mode = src->mode; aead->salt = src->salt; aead->authsize = src->authsize; atomic_set(&aead->users, 0); atomic64_set(&aead->seqno, 0); refcount_set(&aead->refcnt, 1); WARN_ON(!refcount_inc_not_zero(&src->refcnt)); aead->cloned = src; *dst = aead; return 0; } /** * tipc_aead_mem_alloc - Allocate memory for AEAD request operations * @tfm: cipher handle to be registered with the request * @crypto_ctx_size: size of crypto context for callback * @iv: returned pointer to IV data * @req: returned pointer to AEAD request data * @sg: returned pointer to SG lists * @nsg: number of SG lists to be allocated * * Allocate memory to store the crypto context data, AEAD request, IV and SG * lists, the memory layout is as follows: * crypto_ctx || iv || aead_req || sg[] * * Return: the pointer to the memory areas in case of success, otherwise NULL */ static void *tipc_aead_mem_alloc(struct crypto_aead *tfm, unsigned int crypto_ctx_size, u8 **iv, struct aead_request **req, struct scatterlist **sg, int nsg) { unsigned int iv_size, req_size; unsigned int len; u8 *mem; iv_size = crypto_aead_ivsize(tfm); req_size = sizeof(**req) + crypto_aead_reqsize(tfm); len = crypto_ctx_size; len += iv_size; len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); len += nsg * sizeof(**sg); mem = kmalloc(len, GFP_ATOMIC); if (!mem) return NULL; *iv = (u8 *)PTR_ALIGN(mem + crypto_ctx_size, crypto_aead_alignmask(tfm) + 1); *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, crypto_tfm_ctx_alignment()); *sg = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, __alignof__(struct scatterlist)); return (void *)mem; } /** * tipc_aead_encrypt - Encrypt a message * @aead: TIPC AEAD key for the message encryption * @skb: the input/output skb * @b: TIPC bearer where the message will be delivered after the encryption * @dst: the destination media address * @__dnode: TIPC dest node if "known" * * Return: * * 0 : if the encryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the encryption has failed */ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct crypto_aead *tfm = tipc_aead_tfm_next(aead); struct tipc_crypto_tx_ctx *tx_ctx; struct aead_request *req; struct sk_buff *trailer; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, len, tailen, nsg, rc; void *ctx; u32 salt; u8 *iv; /* Make sure message len at least 4-byte aligned */ len = ALIGN(skb->len, 4); tailen = len - skb->len + aead->authsize; /* Expand skb tail for authentication tag: * As for simplicity, we'd have made sure skb having enough tailroom * for authentication tag @skb allocation. Even when skb is nonlinear * but there is no frag_list, it should be still fine! * Otherwise, we must cow it to be a writable buffer with the tailroom. */ SKB_LINEAR_ASSERT(skb); if (tailen > skb_tailroom(skb)) { pr_debug("TX(): skb tailroom is not enough: %d, requires: %d\n", skb_tailroom(skb), tailen); } nsg = skb_cow_data(skb, tailen, &trailer); if (unlikely(nsg < 0)) { pr_err("TX: skb_cow_data() returned %d\n", nsg); return nsg; } pskb_put(skb, trailer, tailen); /* Allocate memory for the AEAD operation */ ctx = tipc_aead_mem_alloc(tfm, sizeof(*tx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("TX: skb_to_sgvec() returned %d, nsg %d!\n", rc, nsg); goto exit; } /* Prepare IV: [SALT (4 octets)][SEQNO (8 octets)] * In case we're in cluster-key mode, SALT is varied by xor-ing with * the source address (or w0 of id), otherwise with the dest address * if dest is known. */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (__dnode) salt ^= tipc_node_get_addr(__dnode); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_encrypt_done, skb); tx_ctx = (struct tipc_crypto_tx_ctx *)ctx; tx_ctx->aead = aead; tx_ctx->bearer = b; memcpy(&tx_ctx->dst, dst, sizeof(*dst)); /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Get net to avoid freed tipc_crypto when delete namespace */ if (!maybe_get_net(aead->crypto->net)) { tipc_bearer_put(b); rc = -ENODEV; goto exit; } /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); put_net(aead->crypto->net); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_encrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_tx_ctx *tx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = tx_ctx->bearer; struct tipc_aead *aead = tx_ctx->aead; struct tipc_crypto *tx = aead->crypto; struct net *net = tx->net; switch (err) { case 0: this_cpu_inc(tx->stats->stat[STAT_ASYNC_OK]); rcu_read_lock(); if (likely(test_bit(0, &b->up))) b->media->send_msg(net, skb, b, &tx_ctx->dst); else kfree_skb(skb); rcu_read_unlock(); break; case -EINPROGRESS: return; default: this_cpu_inc(tx->stats->stat[STAT_ASYNC_NOK]); kfree_skb(skb); break; } kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); put_net(net); } /** * tipc_aead_decrypt - Decrypt an encrypted message * @net: struct net * @aead: TIPC AEAD for the message decryption * @skb: the input/output skb * @b: TIPC bearer where the message has been received * * Return: * * 0 : if the decryption has completed * * -EINPROGRESS/-EBUSY : if a callback will be performed * * < 0 : the decryption has failed */ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_crypto_rx_ctx *rx_ctx; struct aead_request *req; struct crypto_aead *tfm; struct sk_buff *unused; struct scatterlist *sg; struct tipc_ehdr *ehdr; int ehsz, nsg, rc; void *ctx; u32 salt; u8 *iv; if (unlikely(!aead)) return -ENOKEY; nsg = skb_cow_data(skb, 0, &unused); if (unlikely(nsg < 0)) { pr_err("RX: skb_cow_data() returned %d\n", nsg); return nsg; } /* Allocate memory for the AEAD operation */ tfm = tipc_aead_tfm_next(aead); ctx = tipc_aead_mem_alloc(tfm, sizeof(*rx_ctx), &iv, &req, &sg, nsg); if (unlikely(!ctx)) return -ENOMEM; TIPC_SKB_CB(skb)->crypto_ctx = ctx; /* Map skb to the sg lists */ sg_init_table(sg, nsg); rc = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(rc < 0)) { pr_err("RX: skb_to_sgvec() returned %d, nsg %d\n", rc, nsg); goto exit; } /* Reconstruct IV: */ ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) salt ^= __be32_to_cpu(ehdr->addr); else if (ehdr->destined) salt ^= tipc_own_addr(net); memcpy(iv, &salt, 4); memcpy(iv + 4, (u8 *)&ehdr->seqno, 8); /* Prepare request */ ehsz = tipc_ehdr_size(ehdr); aead_request_set_tfm(req, tfm); aead_request_set_ad(req, ehsz); aead_request_set_crypt(req, sg, sg, skb->len - ehsz, iv); /* Set callback function & data */ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tipc_aead_decrypt_done, skb); rx_ctx = (struct tipc_crypto_rx_ctx *)ctx; rx_ctx->aead = aead; rx_ctx->bearer = b; /* Hold bearer */ if (unlikely(!tipc_bearer_hold(b))) { rc = -ENODEV; goto exit; } /* Now, do decrypt */ rc = crypto_aead_decrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); exit: kfree(ctx); TIPC_SKB_CB(skb)->crypto_ctx = NULL; return rc; } static void tipc_aead_decrypt_done(void *data, int err) { struct sk_buff *skb = data; struct tipc_crypto_rx_ctx *rx_ctx = TIPC_SKB_CB(skb)->crypto_ctx; struct tipc_bearer *b = rx_ctx->bearer; struct tipc_aead *aead = rx_ctx->aead; struct tipc_crypto_stats __percpu *stats = aead->crypto->stats; struct net *net = aead->crypto->net; switch (err) { case 0: this_cpu_inc(stats->stat[STAT_ASYNC_OK]); break; case -EINPROGRESS: return; default: this_cpu_inc(stats->stat[STAT_ASYNC_NOK]); break; } kfree(rx_ctx); tipc_crypto_rcv_complete(net, aead, b, &skb, err); if (likely(skb)) { if (likely(test_bit(0, &b->up))) tipc_rcv(net, skb, b); else kfree_skb(skb); } tipc_bearer_put(b); } static inline int tipc_ehdr_size(struct tipc_ehdr *ehdr) { return (ehdr->user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; } /** * tipc_ehdr_validate - Validate an encryption message * @skb: the message buffer * * Return: "true" if this is a valid encryption message, otherwise "false" */ bool tipc_ehdr_validate(struct sk_buff *skb) { struct tipc_ehdr *ehdr; int ehsz; if (unlikely(!pskb_may_pull(skb, EHDR_MIN_SIZE))) return false; ehdr = (struct tipc_ehdr *)skb->data; if (unlikely(ehdr->version != TIPC_EVERSION)) return false; ehsz = tipc_ehdr_size(ehdr); if (unlikely(!pskb_may_pull(skb, ehsz))) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; return true; } /** * tipc_ehdr_build - Build TIPC encryption message header * @net: struct net * @aead: TX AEAD key to be used for the message encryption * @tx_key: key id used for the message encryption * @skb: input/output message skb * @__rx: RX crypto handle if dest is "known" * * Return: the header size if the building is successful, otherwise < 0 */ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, u8 tx_key, struct sk_buff *skb, struct tipc_crypto *__rx) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_ehdr *ehdr; u32 user = msg_user(hdr); u64 seqno; int ehsz; /* Make room for encryption header */ ehsz = (user != LINK_CONFIG) ? EHDR_SIZE : EHDR_CFG_SIZE; WARN_ON(skb_headroom(skb) < ehsz); ehdr = (struct tipc_ehdr *)skb_push(skb, ehsz); /* Obtain a seqno first: * Use the key seqno (= cluster wise) if dest is unknown or we're in * cluster key mode, otherwise it's better for a per-peer seqno! */ if (!__rx || aead->mode == CLUSTER_KEY) seqno = atomic64_inc_return(&aead->seqno); else seqno = atomic64_inc_return(&__rx->sndnxt); /* Revoke the key if seqno is wrapped around */ if (unlikely(!seqno)) return tipc_crypto_key_revoke(net, tx_key); /* Word 1-2 */ ehdr->seqno = cpu_to_be64(seqno); /* Words 0, 3- */ ehdr->version = TIPC_EVERSION; ehdr->user = 0; ehdr->keepalive = 0; ehdr->tx_key = tx_key; ehdr->destined = (__rx) ? 1 : 0; ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; ehdr->rx_nokey = (__rx) ? __rx->nokey : 0; ehdr->master_key = aead->crypto->key_master; ehdr->reserved_1 = 0; ehdr->reserved_2 = 0; switch (user) { case LINK_CONFIG: ehdr->user = LINK_CONFIG; memcpy(ehdr->id, tipc_own_id(net), NODE_ID_LEN); break; default: if (user == LINK_PROTOCOL && msg_type(hdr) == STATE_MSG) { ehdr->user = LINK_PROTOCOL; ehdr->keepalive = msg_is_keepalive(hdr); } ehdr->addr = hdr->hdr[3]; break; } return ehsz; } static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_passive, u8 new_active, u8 new_pending) { struct tipc_key old = c->key; char buf[32]; c->key.keys = ((new_passive & KEY_MASK) << (KEY_BITS * 2)) | ((new_active & KEY_MASK) << (KEY_BITS)) | ((new_pending & KEY_MASK)); pr_debug("%s: key changing %s ::%pS\n", c->name, tipc_key_change_dump(old, c->key, buf), __builtin_return_address(0)); } /** * tipc_crypto_key_init - Initiate a new user / AEAD key * @c: TIPC crypto to which new key is attached * @ukey: the user key * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) * @master_key: specify this is a cluster master key * * A new TIPC AEAD key will be allocated and initiated with the specified user * key, then attached to the TIPC crypto. * * Return: new key id in case of success, otherwise: < 0 */ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, u8 mode, bool master_key) { struct tipc_aead *aead = NULL; int rc = 0; /* Initiate with the new user key */ rc = tipc_aead_init(&aead, ukey, mode); /* Attach it to the crypto */ if (likely(!rc)) { rc = tipc_crypto_key_attach(c, aead, 0, master_key); if (rc < 0) tipc_aead_free(&aead->rcu); } return rc; } /** * tipc_crypto_key_attach - Attach a new AEAD key to TIPC crypto * @c: TIPC crypto to which the new AEAD key is attached * @aead: the new AEAD key pointer * @pos: desired slot in the crypto key array, = 0 if any! * @master_key: specify this is a cluster master key * * Return: new key id in case of success, otherwise: -EBUSY */ static int tipc_crypto_key_attach(struct tipc_crypto *c, struct tipc_aead *aead, u8 pos, bool master_key) { struct tipc_key key; int rc = -EBUSY; u8 new_key; spin_lock_bh(&c->lock); key = c->key; if (master_key) { new_key = KEY_MASTER; goto attach; } if (key.active && key.passive) goto exit; if (key.pending) { if (tipc_aead_users(c->aead[key.pending]) > 0) goto exit; /* if (pos): ok with replacing, will be aligned when needed */ /* Replace it */ new_key = key.pending; } else { if (pos) { if (key.active && pos != key_next(key.active)) { key.passive = pos; new_key = pos; goto attach; } else if (!key.active && !key.passive) { key.pending = pos; new_key = pos; goto attach; } } key.pending = key_next(key.active ?: key.passive); new_key = key.pending; } attach: aead->crypto = c; aead->gen = (is_tx(c)) ? ++c->key_gen : c->key_gen; tipc_aead_rcu_replace(c->aead[new_key], aead, &c->lock); if (likely(c->key.keys != key.keys)) tipc_crypto_key_set_state(c, key.passive, key.active, key.pending); c->working = 1; c->nokey = 0; c->key_master |= master_key; rc = new_key; exit: spin_unlock_bh(&c->lock); return rc; } void tipc_crypto_key_flush(struct tipc_crypto *c) { struct tipc_crypto *tx, *rx; int k; spin_lock_bh(&c->lock); if (is_rx(c)) { /* Try to cancel pending work */ rx = c; tx = tipc_net(rx->net)->crypto_tx; if (cancel_delayed_work(&rx->work)) { kfree(rx->skey); rx->skey = NULL; atomic_xchg(&rx->key_distr, 0); tipc_node_put(rx->node); } /* RX stopping => decrease TX key users if any */ k = atomic_xchg(&rx->peer_rx_active, 0); if (k) { tipc_aead_users_dec(tx->aead[k], 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; } } c->flags = 0; tipc_crypto_key_set_state(c, 0, 0, 0); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_crypto_key_detach(c->aead[k], &c->lock); atomic64_set(&c->sndnxt, 0); spin_unlock_bh(&c->lock); } /** * tipc_crypto_key_try_align - Align RX keys if possible * @rx: RX crypto handle * @new_pending: new pending slot if aligned (= TX key from peer) * * Peer has used an unknown key slot, this only happens when peer has left and * rejoned, or we are newcomer. * That means, there must be no active key but a pending key at unaligned slot. * If so, we try to move the pending key to the new slot. * Note: A potential passive key can exist, it will be shifted correspondingly! * * Return: "true" if key is successfully aligned, otherwise "false" */ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) { struct tipc_aead *tmp1, *tmp2 = NULL; struct tipc_key key; bool aligned = false; u8 new_passive = 0; int x; spin_lock(&rx->lock); key = rx->key; if (key.pending == new_pending) { aligned = true; goto exit; } if (key.active) goto exit; if (!key.pending) goto exit; if (tipc_aead_users(rx->aead[key.pending]) > 0) goto exit; /* Try to "isolate" this pending key first */ tmp1 = tipc_aead_rcu_ptr(rx->aead[key.pending], &rx->lock); if (!refcount_dec_if_one(&tmp1->refcnt)) goto exit; rcu_assign_pointer(rx->aead[key.pending], NULL); /* Move passive key if any */ if (key.passive) { tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); x = (key.passive - key.pending + new_pending) % KEY_MAX; new_passive = (x <= 0) ? x + KEY_MAX : x; } /* Re-allocate the key(s) */ tipc_crypto_key_set_state(rx, new_passive, 0, new_pending); rcu_assign_pointer(rx->aead[new_pending], tmp1); if (new_passive) rcu_assign_pointer(rx->aead[new_passive], tmp2); refcount_set(&tmp1->refcnt, 1); aligned = true; pr_info_ratelimited("%s: key[%d] -> key[%d]\n", rx->name, key.pending, new_pending); exit: spin_unlock(&rx->lock); return aligned; } /** * tipc_crypto_key_pick_tx - Pick one TX key for message decryption * @tx: TX crypto handle * @rx: RX crypto handle (can be NULL) * @skb: the message skb which will be decrypted later * @tx_key: peer TX key id * * This function looks up the existing TX keys and pick one which is suitable * for the message decryption, that must be a cluster key and not used before * on the same message (i.e. recursive). * * Return: the TX AEAD key handle in case of success, otherwise NULL */ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, struct sk_buff *skb, u8 tx_key) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); struct tipc_aead *aead = NULL; struct tipc_key key = tx->key; u8 k, i = 0; /* Initialize data if not yet */ if (!skb_cb->tx_clone_deferred) { skb_cb->tx_clone_deferred = 1; memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); } skb_cb->tx_clone_ctx.rx = rx; if (++skb_cb->tx_clone_ctx.recurs > 2) return NULL; /* Pick one TX key */ spin_lock(&tx->lock); if (tx_key == KEY_MASTER) { aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock); goto done; } do { k = (i == 0) ? key.pending : ((i == 1) ? key.active : key.passive); if (!k) continue; aead = tipc_aead_rcu_ptr(tx->aead[k], &tx->lock); if (!aead) continue; if (aead->mode != CLUSTER_KEY || aead == skb_cb->tx_clone_ctx.last) { aead = NULL; continue; } /* Ok, found one cluster key */ skb_cb->tx_clone_ctx.last = aead; WARN_ON(skb->next); skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); break; } while (++i < 3); done: if (likely(aead)) WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; } /** * tipc_crypto_key_synch: Synch own key data according to peer key status * @rx: RX crypto handle * @skb: TIPCv2 message buffer (incl. the ehdr from peer) * * This function updates the peer node related data as the peer RX active key * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * * It also considers if peer has no key, then we need to make own master key * (if any) taking over i.e. starting grace period and also trigger key * distributing process. * * The "per-peer" sndnxt is also reset when the peer key has switched. */ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) { struct tipc_ehdr *ehdr = (struct tipc_ehdr *)skb_network_header(skb); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_msg *hdr = buf_msg(skb); u32 self = tipc_own_addr(rx->net); u8 cur, new; unsigned long delay; /* Update RX 'key_master' flag according to peer, also mark "legacy" if * a peer has no master key. */ rx->key_master = ehdr->master_key; if (!rx->key_master) tx->legacy_user = 1; /* For later cases, apply only if message is destined to this node */ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; /* Case 1: Peer has no keys, let's make master key take over */ if (ehdr->rx_nokey) { /* Set or extend grace period */ tx->timer2 = jiffies; /* Schedule key distributing for the peer if not yet */ if (tx->key.keys && !atomic_cmpxchg(&rx->key_distr, 0, KEY_DISTR_SCHED)) { get_random_bytes(&delay, 2); delay %= 5; delay = msecs_to_jiffies(500 * ++delay); if (queue_delayed_work(tx->wq, &rx->work, delay)) tipc_node_get(rx->node); } } else { /* Cancel a pending key distributing if any */ atomic_xchg(&rx->key_distr, 0); } /* Case 2: Peer RX active key has changed, let's update own TX users */ cur = atomic_read(&rx->peer_rx_active); new = ehdr->rx_key_active; if (tx->key.keys && cur != new && atomic_cmpxchg(&rx->peer_rx_active, cur, new) == cur) { if (new) tipc_aead_users_inc(tx->aead[new], INT_MAX); if (cur) tipc_aead_users_dec(tx->aead[cur], 0); atomic64_set(&rx->sndnxt, 0); /* Mark the point TX key users changed */ tx->timer1 = jiffies; pr_debug("%s: key users changed %d-- %d++, peer %s\n", tx->name, cur, new, rx->name); } } static int tipc_crypto_key_revoke(struct net *net, u8 tx_key) { struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_key key; spin_lock_bh(&tx->lock); key = tx->key; WARN_ON(!key.active || tx_key != key.active); /* Free the active key */ tipc_crypto_key_set_state(tx, key.passive, 0, key.pending); tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); spin_unlock_bh(&tx->lock); pr_warn("%s: key is revoked\n", tx->name); return -EKEYREVOKED; } int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, struct tipc_node *node) { struct tipc_crypto *c; if (*crypto) return -EEXIST; /* Allocate crypto */ c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return -ENOMEM; /* Allocate workqueue on TX */ if (!node) { c->wq = alloc_ordered_workqueue("tipc_crypto", 0); if (!c->wq) { kfree(c); return -ENOMEM; } } /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { if (c->wq) destroy_workqueue(c->wq); kfree_sensitive(c); return -ENOMEM; } c->flags = 0; c->net = net; c->node = node; get_random_bytes(&c->key_gen, 2); tipc_crypto_key_set_state(c, 0, 0, 0); atomic_set(&c->key_distr, 0); atomic_set(&c->peer_rx_active, 0); atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? "RX" : "TX", (is_rx(c)) ? tipc_node_get_id_str(c->node) : tipc_own_id_string(c->net)); if (is_rx(c)) INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); else INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); *crypto = c; return 0; } void tipc_crypto_stop(struct tipc_crypto **crypto) { struct tipc_crypto *c = *crypto; u8 k; if (!c) return; /* Flush any queued works & destroy wq */ if (is_tx(c)) { c->rekeying_intv = 0; cancel_delayed_work_sync(&c->work); destroy_workqueue(c->wq); } /* Release AEAD keys */ rcu_read_lock(); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_aead_put(rcu_dereference(c->aead[k])); rcu_read_unlock(); pr_debug("%s: has been stopped\n", c->name); /* Free this crypto statistics */ free_percpu(c->stats); *crypto = NULL; kfree_sensitive(c); } void tipc_crypto_timeout(struct tipc_crypto *rx) { struct tipc_net *tn = tipc_net(rx->net); struct tipc_crypto *tx = tn->crypto_tx; struct tipc_key key; int cmd; /* TX pending: taking all users & stable -> active */ spin_lock(&tx->lock); key = tx->key; if (key.active && tipc_aead_users(tx->aead[key.active]) > 0) goto s1; if (!key.pending || tipc_aead_users(tx->aead[key.pending]) <= 0) goto s1; if (time_before(jiffies, tx->timer1 + TIPC_TX_LASTING_TIME)) goto s1; tipc_crypto_key_set_state(tx, key.passive, key.pending, 0); if (key.active) tipc_crypto_key_detach(tx->aead[key.active], &tx->lock); this_cpu_inc(tx->stats->stat[STAT_SWITCHES]); pr_info("%s: key[%d] is activated\n", tx->name, key.pending); s1: spin_unlock(&tx->lock); /* RX pending: having user -> active */ spin_lock(&rx->lock); key = rx->key; if (!key.pending || tipc_aead_users(rx->aead[key.pending]) <= 0) goto s2; if (key.active) key.passive = key.active; key.active = key.pending; rx->timer2 = jiffies; tipc_crypto_key_set_state(rx, key.passive, key.active, 0); this_cpu_inc(rx->stats->stat[STAT_SWITCHES]); pr_info("%s: key[%d] is activated\n", rx->name, key.pending); goto s5; s2: /* RX pending: not working -> remove */ if (!key.pending || tipc_aead_users(rx->aead[key.pending]) > -10) goto s3; tipc_crypto_key_set_state(rx, key.passive, key.active, 0); tipc_crypto_key_detach(rx->aead[key.pending], &rx->lock); pr_debug("%s: key[%d] is removed\n", rx->name, key.pending); goto s5; s3: /* RX active: timed out or no user -> pending */ if (!key.active) goto s4; if (time_before(jiffies, rx->timer1 + TIPC_RX_ACTIVE_LIM) && tipc_aead_users(rx->aead[key.active]) > 0) goto s4; if (key.pending) key.passive = key.active; else key.pending = key.active; rx->timer2 = jiffies; tipc_crypto_key_set_state(rx, key.passive, 0, key.pending); tipc_aead_users_set(rx->aead[key.pending], 0); pr_debug("%s: key[%d] is deactivated\n", rx->name, key.active); goto s5; s4: /* RX passive: outdated or not working -> free */ if (!key.passive) goto s5; if (time_before(jiffies, rx->timer2 + TIPC_RX_PASSIVE_LIM) && tipc_aead_users(rx->aead[key.passive]) > -10) goto s5; tipc_crypto_key_set_state(rx, 0, key.active, key.pending); tipc_crypto_key_detach(rx->aead[key.passive], &rx->lock); pr_debug("%s: key[%d] is freed\n", rx->name, key.passive); s5: spin_unlock(&rx->lock); /* Relax it here, the flag will be set again if it really is, but only * when we are not in grace period for safety! */ if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) tx->legacy_user = 0; /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; cmd = sysctl_tipc_max_tfms; sysctl_tipc_max_tfms = TIPC_MAX_TFMS_DEF; tipc_crypto_do_cmd(rx->net, cmd); } static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode, u8 type) { struct sk_buff *skb; skb = skb_clone(_skb, GFP_ATOMIC); if (skb) { TIPC_SKB_CB(skb)->xmit_type = type; tipc_crypto_xmit(net, &skb, b, dst, __dnode); if (skb) b->media->send_msg(net, skb, b, dst); } } /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net * @skb: input/output message skb pointer * @b: bearer used for xmit later * @dst: destination media address * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then * encrypt the original TIPC message by using the pending, master or active * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! * * Return: * * 0 : the encryption has succeeded (or no encryption) * * -EINPROGRESS/-EBUSY : the encryption is ongoing, a callback will be made * * -ENOKEK : the encryption has failed due to no key * * -EKEYREVOKED : the encryption has failed due to key revoked * * -ENOMEM : the encryption has failed due to no memory * * < 0 : the encryption has failed due to other reasons */ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_bearer *b, struct tipc_media_addr *dst, struct tipc_node *__dnode) { struct tipc_crypto *__rx = tipc_node_crypto_rx(__dnode); struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_crypto_stats __percpu *stats = tx->stats; struct tipc_msg *hdr = buf_msg(*skb); struct tipc_key key = tx->key; struct tipc_aead *aead = NULL; u32 user = msg_user(hdr); u32 type = msg_type(hdr); int rc = -ENOKEY; u8 tx_key = 0; /* No encryption? */ if (!tx->working) return 0; /* Pending key if peer has active on it or probing time */ if (unlikely(key.pending)) { tx_key = key.pending; if (!tx->key_master && !key.active) goto encrypt; if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) goto encrypt; if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) { pr_debug("%s: probing for key[%d]\n", tx->name, key.pending); goto encrypt; } if (user == LINK_CONFIG || user == LINK_PROTOCOL) tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, SKB_PROBING); } /* Master key if this is a *vital* message or in grace period */ if (tx->key_master) { tx_key = KEY_MASTER; if (!key.active) goto encrypt; if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) { pr_debug("%s: gracing for msg (%d %d)\n", tx->name, user, type); goto encrypt; } if (user == LINK_CONFIG || (user == LINK_PROTOCOL && type == RESET_MSG) || (user == MSG_CRYPTO && type == KEY_DISTR_MSG) || time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) { if (__rx && __rx->key_master && !atomic_read(&__rx->peer_rx_active)) goto encrypt; if (!__rx) { if (likely(!tx->legacy_user)) goto encrypt; tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, SKB_GRACING); } } } /* Else, use the active key if any */ if (likely(key.active)) { tx_key = key.active; goto encrypt; } goto exit; encrypt: aead = tipc_aead_get(tx->aead[tx_key]); if (unlikely(!aead)) goto exit; rc = tipc_ehdr_build(net, aead, tx_key, *skb, __rx); if (likely(rc > 0)) rc = tipc_aead_encrypt(aead, *skb, b, dst, __dnode); exit: switch (rc) { case 0: this_cpu_inc(stats->stat[STAT_OK]); break; case -EINPROGRESS: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); if (rc == -ENOKEY) this_cpu_inc(stats->stat[STAT_NOKEYS]); else if (rc == -EKEYREVOKED) this_cpu_inc(stats->stat[STAT_BADKEYS]); kfree_skb(*skb); *skb = NULL; break; } tipc_aead_put(aead); return rc; } /** * tipc_crypto_rcv - Decrypt an encrypted TIPC message from peer * @net: struct net * @rx: RX crypto handle * @skb: input/output message skb pointer * @b: bearer where the message has been received * * If the decryption is successful, the decrypted skb is returned directly or * as the callback, the encryption header and auth tag will be trimmed out * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). * Otherwise, the skb will be freed! * Note: RX key(s) can be re-aligned, or in case of no key suitable, TX * cluster key(s) can be taken for decryption (- recursive). * * Return: * * 0 : the decryption has successfully completed * * -EINPROGRESS/-EBUSY : the decryption is ongoing, a callback will be made * * -ENOKEY : the decryption has failed due to no key * * -EBADMSG : the decryption has failed due to bad message * * -ENOMEM : the decryption has failed due to no memory * * < 0 : the decryption has failed due to other reasons */ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b) { struct tipc_crypto *tx = tipc_net(net)->crypto_tx; struct tipc_crypto_stats __percpu *stats; struct tipc_aead *aead = NULL; struct tipc_key key; int rc = -ENOKEY; u8 tx_key, n; tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* New peer? * Let's try with TX key (i.e. cluster mode) & verify the skb first! */ if (unlikely(!rx || tx_key == KEY_MASTER)) goto pick_tx; /* Pick RX key according to TX key if any */ key = rx->key; if (tx_key == key.active || tx_key == key.pending || tx_key == key.passive) goto decrypt; /* Unknown key, let's try to align RX key(s) */ if (tipc_crypto_key_try_align(rx, tx_key)) goto decrypt; pick_tx: /* No key suitable? Try to pick one from TX... */ aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key); if (aead) goto decrypt; goto exit; decrypt: rcu_read_lock(); if (!aead) aead = tipc_aead_get(rx->aead[tx_key]); rc = tipc_aead_decrypt(net, aead, *skb, b); rcu_read_unlock(); exit: stats = ((rx) ?: tx)->stats; switch (rc) { case 0: this_cpu_inc(stats->stat[STAT_OK]); break; case -EINPROGRESS: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); if (rc == -ENOKEY) { kfree_skb(*skb); *skb = NULL; if (rx) { /* Mark rx->nokey only if we dont have a * pending received session key, nor a newer * one i.e. in the next slot. */ n = key_next(tx_key); rx->nokey = !(rx->skey || rcu_access_pointer(rx->aead[n])); pr_debug_ratelimited("%s: nokey %d, key %d/%x\n", rx->name, rx->nokey, tx_key, rx->key.keys); tipc_node_put(rx->node); } this_cpu_inc(stats->stat[STAT_NOKEYS]); return rc; } else if (rc == -EBADMSG) { this_cpu_inc(stats->stat[STAT_BADMSGS]); } break; } tipc_crypto_rcv_complete(net, aead, b, skb, rc); return rc; } static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(*skb); struct tipc_crypto *rx = aead->crypto; struct tipc_aead *tmp = NULL; struct tipc_ehdr *ehdr; struct tipc_node *n; /* Is this completed by TX? */ if (unlikely(is_tx(aead->crypto))) { rx = skb_cb->tx_clone_ctx.rx; pr_debug("TX->RX(%s): err %d, aead %p, skb->next %p, flags %x\n", (rx) ? tipc_node_get_id_str(rx->node) : "-", err, aead, (*skb)->next, skb_cb->flags); pr_debug("skb_cb [recurs %d, last %p], tx->aead [%p %p %p]\n", skb_cb->tx_clone_ctx.recurs, skb_cb->tx_clone_ctx.last, aead->crypto->aead[1], aead->crypto->aead[2], aead->crypto->aead[3]); if (unlikely(err)) { if (err == -EBADMSG && (*skb)->next) tipc_rcv(net, (*skb)->next, b); goto free_skb; } if (likely((*skb)->next)) { kfree_skb((*skb)->next); (*skb)->next = NULL; } ehdr = (struct tipc_ehdr *)(*skb)->data; if (!rx) { WARN_ON(ehdr->user != LINK_CONFIG); n = tipc_node_create(net, 0, ehdr->id, 0xffffu, 0, true); rx = tipc_node_crypto_rx(n); if (unlikely(!rx)) goto free_skb; } /* Ignore cloning if it was TX master key */ if (ehdr->tx_key == KEY_MASTER) goto rcv; if (tipc_aead_clone(&tmp, aead) < 0) goto rcv; WARN_ON(!refcount_inc_not_zero(&tmp->refcnt)); if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) { tipc_aead_free(&tmp->rcu); goto rcv; } tipc_aead_put(aead); aead = tmp; } if (unlikely(err)) { tipc_aead_users_dec((struct tipc_aead __force __rcu *)aead, INT_MIN); goto free_skb; } /* Set the RX key's user */ tipc_aead_users_set((struct tipc_aead __force __rcu *)aead, 1); /* Mark this point, RX works */ rx->timer1 = jiffies; rcv: /* Remove ehdr & auth. tag prior to tipc_rcv() */ ehdr = (struct tipc_ehdr *)(*skb)->data; /* Mark this point, RX passive still works */ if (rx->key.passive && ehdr->tx_key == rx->key.passive) rx->timer2 = jiffies; skb_reset_network_header(*skb); skb_pull(*skb, tipc_ehdr_size(ehdr)); if (pskb_trim(*skb, (*skb)->len - aead->authsize)) goto free_skb; /* Validate TIPCv2 message */ if (unlikely(!tipc_msg_validate(skb))) { pr_err_ratelimited("Packet dropped after decryption!\n"); goto free_skb; } /* Ok, everything's fine, try to synch own keys according to peers' */ tipc_crypto_key_synch(rx, *skb); /* Re-fetch skb cb as skb might be changed in tipc_msg_validate */ skb_cb = TIPC_SKB_CB(*skb); /* Mark skb decrypted */ skb_cb->decrypted = 1; /* Clear clone cxt if any */ if (likely(!skb_cb->tx_clone_deferred)) goto exit; skb_cb->tx_clone_deferred = 0; memset(&skb_cb->tx_clone_ctx, 0, sizeof(skb_cb->tx_clone_ctx)); goto exit; free_skb: kfree_skb(*skb); *skb = NULL; exit: tipc_aead_put(aead); if (rx) tipc_node_put(rx->node); } static void tipc_crypto_do_cmd(struct net *net, int cmd) { struct tipc_net *tn = tipc_net(net); struct tipc_crypto *tx = tn->crypto_tx, *rx; struct list_head *p; unsigned int stat; int i, j, cpu; char buf[200]; /* Currently only one command is supported */ switch (cmd) { case 0xfff1: goto print_stats; default: return; } print_stats: /* Print a header */ pr_info("\n=============== TIPC Crypto Statistics ===============\n\n"); /* Print key status */ pr_info("Key status:\n"); pr_info("TX(%7.7s)\n%s", tipc_own_id_string(net), tipc_crypto_key_dump(tx, buf)); rcu_read_lock(); for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { rx = tipc_node_crypto_rx_by_list(p); pr_info("RX(%7.7s)\n%s", tipc_node_get_id_str(rx->node), tipc_crypto_key_dump(rx, buf)); } rcu_read_unlock(); /* Print crypto statistics */ for (i = 0, j = 0; i < MAX_STATS; i++) j += scnprintf(buf + j, 200 - j, "|%11s ", hstats[i]); pr_info("Counter %s", buf); memset(buf, '-', 115); buf[115] = '\0'; pr_info("%s\n", buf); j = scnprintf(buf, 200, "TX(%7.7s) ", tipc_own_id_string(net)); for_each_possible_cpu(cpu) { for (i = 0; i < MAX_STATS; i++) { stat = per_cpu_ptr(tx->stats, cpu)->stat[i]; j += scnprintf(buf + j, 200 - j, "|%11d ", stat); } pr_info("%s", buf); j = scnprintf(buf, 200, "%12s", " "); } rcu_read_lock(); for (p = tn->node_list.next; p != &tn->node_list; p = p->next) { rx = tipc_node_crypto_rx_by_list(p); j = scnprintf(buf, 200, "RX(%7.7s) ", tipc_node_get_id_str(rx->node)); for_each_possible_cpu(cpu) { for (i = 0; i < MAX_STATS; i++) { stat = per_cpu_ptr(rx->stats, cpu)->stat[i]; j += scnprintf(buf + j, 200 - j, "|%11d ", stat); } pr_info("%s", buf); j = scnprintf(buf, 200, "%12s", " "); } } rcu_read_unlock(); pr_info("\n======================== Done ========================\n"); } static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) { struct tipc_key key = c->key; struct tipc_aead *aead; int k, i = 0; char *s; for (k = KEY_MIN; k <= KEY_MAX; k++) { if (k == KEY_MASTER) { if (is_rx(c)) continue; if (time_before(jiffies, c->timer2 + TIPC_TX_GRACE_PERIOD)) s = "ACT"; else s = "PAS"; } else { if (k == key.passive) s = "PAS"; else if (k == key.active) s = "ACT"; else if (k == key.pending) s = "PEN"; else s = "-"; } i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); rcu_read_lock(); aead = rcu_dereference(c->aead[k]); if (aead) i += scnprintf(buf + i, 200 - i, "{\"0x...%s\", \"%s\"}/%d:%d", aead->hint, (aead->mode == CLUSTER_KEY) ? "c" : "p", atomic_read(&aead->users), refcount_read(&aead->refcnt)); rcu_read_unlock(); i += scnprintf(buf + i, 200 - i, "\n"); } if (is_rx(c)) i += scnprintf(buf + i, 200 - i, "\tPeer RX active: %d\n", atomic_read(&c->peer_rx_active)); return buf; } static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, char *buf) { struct tipc_key *key = &old; int k, i = 0; char *s; /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ again: i += scnprintf(buf + i, 32 - i, "["); for (k = KEY_1; k <= KEY_3; k++) { if (k == key->passive) s = "pas"; else if (k == key->active) s = "act"; else if (k == key->pending) s = "pen"; else s = "-"; i += scnprintf(buf + i, 32 - i, (k != KEY_3) ? "%s " : "%s", s); } if (key != &new) { i += scnprintf(buf + i, 32 - i, "] -> "); key = &new; goto again; } i += scnprintf(buf + i, 32 - i, "]"); return buf; } /** * tipc_crypto_msg_rcv - Common 'MSG_CRYPTO' processing point * @net: the struct net * @skb: the receiving message buffer */ void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb) { struct tipc_crypto *rx; struct tipc_msg *hdr; if (unlikely(skb_linearize(skb))) goto exit; hdr = buf_msg(skb); rx = tipc_node_crypto_rx_by_addr(net, msg_prevnode(hdr)); if (unlikely(!rx)) goto exit; switch (msg_type(hdr)) { case KEY_DISTR_MSG: if (tipc_crypto_key_rcv(rx, hdr)) goto exit; break; default: break; } tipc_node_put(rx->node); exit: kfree_skb(skb); } /** * tipc_crypto_key_distr - Distribute a TX key * @tx: the TX crypto * @key: the key's index * @dest: the destination tipc node, = NULL if distributing to all nodes * * Return: 0 in case of success, otherwise < 0 */ int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, struct tipc_node *dest) { struct tipc_aead *aead; u32 dnode = tipc_node_get_addr(dest); int rc = -ENOKEY; if (!sysctl_tipc_key_exchange_enabled) return 0; if (key) { rcu_read_lock(); aead = tipc_aead_get(tx->aead[key]); if (likely(aead)) { rc = tipc_crypto_key_xmit(tx->net, aead->key, aead->gen, aead->mode, dnode); tipc_aead_put(aead); } rcu_read_unlock(); } return rc; } /** * tipc_crypto_key_xmit - Send a session key * @net: the struct net * @skey: the session key to be sent * @gen: the key's generation * @mode: the key's mode * @dnode: the destination node address, = 0 if broadcasting to all nodes * * The session key 'skey' is packed in a TIPC v2 'MSG_CRYPTO/KEY_DISTR_MSG' * as its data section, then xmit-ed through the uc/bc link. * * Return: 0 in case of success, otherwise < 0 */ static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode) { struct sk_buff_head pkts; struct tipc_msg *hdr; struct sk_buff *skb; u16 size, cong_link_cnt; u8 *data; int rc; size = tipc_aead_key_size(skey); skb = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = buf_msg(skb); tipc_msg_init(tipc_own_addr(net), hdr, MSG_CRYPTO, KEY_DISTR_MSG, INT_H_SIZE, dnode); msg_set_size(hdr, INT_H_SIZE + size); msg_set_key_gen(hdr, gen); msg_set_key_mode(hdr, mode); data = msg_data(hdr); *((__be32 *)(data + TIPC_AEAD_ALG_NAME)) = htonl(skey->keylen); memcpy(data, skey->alg_name, TIPC_AEAD_ALG_NAME); memcpy(data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->key, skey->keylen); __skb_queue_head_init(&pkts); __skb_queue_tail(&pkts, skb); if (dnode) rc = tipc_node_xmit(net, &pkts, dnode, 0); else rc = tipc_bcast_xmit(net, &pkts, &cong_link_cnt); return rc; } /** * tipc_crypto_key_rcv - Receive a session key * @rx: the RX crypto * @hdr: the TIPC v2 message incl. the receiving session key in its data * * This function retrieves the session key in the message from peer, then * schedules a RX work to attach the key to the corresponding RX crypto. * * Return: "true" if the key has been scheduled for attaching, otherwise * "false". */ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) { struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; struct tipc_aead_key *skey = NULL; u16 key_gen = msg_key_gen(hdr); u32 size = msg_data_sz(hdr); u8 *data = msg_data(hdr); unsigned int keylen; /* Verify whether the size can exist in the packet */ if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) { pr_debug("%s: message data size is too small\n", rx->name); goto exit; } keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); /* Verify the supplied size values */ if (unlikely(keylen > TIPC_AEAD_KEY_SIZE_MAX || size != keylen + sizeof(struct tipc_aead_key))) { pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); goto exit; } spin_lock(&rx->lock); if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) { pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name, rx->skey, key_gen, rx->key_gen); goto exit_unlock; } /* Allocate memory for the key */ skey = kmalloc(size, GFP_ATOMIC); if (unlikely(!skey)) { pr_err("%s: unable to allocate memory for skey\n", rx->name); goto exit_unlock; } /* Copy key from msg data */ skey->keylen = keylen; memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME); memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->keylen); rx->key_gen = key_gen; rx->skey_mode = msg_key_mode(hdr); rx->skey = skey; rx->nokey = 0; mb(); /* for nokey flag */ exit_unlock: spin_unlock(&rx->lock); exit: /* Schedule the key attaching on this crypto */ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0))) return true; return false; } /** * tipc_crypto_work_rx - Scheduled RX works handler * @work: the struct RX work * * The function processes the previous scheduled works i.e. distributing TX key * or attaching a received session key on RX crypto. */ static void tipc_crypto_work_rx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *rx = container_of(dwork, struct tipc_crypto, work); struct tipc_crypto *tx = tipc_net(rx->net)->crypto_tx; unsigned long delay = msecs_to_jiffies(5000); bool resched = false; u8 key; int rc; /* Case 1: Distribute TX key to peer if scheduled */ if (atomic_cmpxchg(&rx->key_distr, KEY_DISTR_SCHED, KEY_DISTR_COMPL) == KEY_DISTR_SCHED) { /* Always pick the newest one for distributing */ key = tx->key.pending ?: tx->key.active; rc = tipc_crypto_key_distr(tx, key, rx->node); if (unlikely(rc)) pr_warn("%s: unable to distr key[%d] to %s, err %d\n", tx->name, key, tipc_node_get_id_str(rx->node), rc); /* Sched for key_distr releasing */ resched = true; } else { atomic_cmpxchg(&rx->key_distr, KEY_DISTR_COMPL, 0); } /* Case 2: Attach a pending received session key from peer if any */ if (rx->skey) { rc = tipc_crypto_key_init(rx, rx->skey, rx->skey_mode, false); if (unlikely(rc < 0)) pr_warn("%s: unable to attach received skey, err %d\n", rx->name, rc); switch (rc) { case -EBUSY: case -ENOMEM: /* Resched the key attaching */ resched = true; break; default: synchronize_rcu(); kfree(rx->skey); rx->skey = NULL; break; } } if (resched && queue_delayed_work(tx->wq, &rx->work, delay)) return; tipc_node_put(rx->node); } /** * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval * @tx: TX crypto * @changed: if the rekeying needs to be rescheduled with new interval * @new_intv: new rekeying interval (when "changed" = true) */ void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, u32 new_intv) { unsigned long delay; bool now = false; if (changed) { if (new_intv == TIPC_REKEYING_NOW) now = true; else tx->rekeying_intv = new_intv; cancel_delayed_work_sync(&tx->work); } if (tx->rekeying_intv || now) { delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000; queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); } } /** * tipc_crypto_work_tx - Scheduled TX works handler * @work: the struct TX work * * The function processes the previous scheduled work, i.e. key rekeying, by * generating a new session key based on current one, then attaching it to the * TX crypto and finally distributing it to peers. It also re-schedules the * rekeying if needed. */ static void tipc_crypto_work_tx(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); struct tipc_aead_key *skey = NULL; struct tipc_key key = tx->key; struct tipc_aead *aead; int rc = -ENOMEM; if (unlikely(key.pending)) goto resched; /* Take current key as a template */ rcu_read_lock(); aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); if (unlikely(!aead)) { rcu_read_unlock(); /* At least one key should exist for securing */ return; } /* Lets duplicate it first */ skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); rcu_read_unlock(); /* Now, generate new key, initiate & distribute it */ if (likely(skey)) { rc = tipc_aead_key_generate(skey) ?: tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); if (likely(rc > 0)) rc = tipc_crypto_key_distr(tx, rc, NULL); kfree_sensitive(skey); } if (unlikely(rc)) pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); resched: /* Re-schedule rekeying if any */ tipc_crypto_rekeying_sched(tx, false, 0); }
10 10 10 10 10 5 124 124 124 123 124 123 90 92 91 53 91 92 92 33 4 2 2 33 3 31 3 29 28 28 33 28 10 51 5 5 5 5 50 49 23 30 30 30 27 26 25 10 10 10 10 10 10 10 10 10 51 10 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 /* * linux/drivers/video/fbmem.c * * Copyright (C) 1994 Martin Schaller * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #include <linux/console.h> #include <linux/export.h> #include <linux/fb.h> #include <linux/fbcon.h> #include <linux/lcd.h> #include <linux/leds.h> #include <video/nomodeset.h> #include "fb_internal.h" /* * Frame buffer device initialization and setup routines */ #define FBPIXMAPSIZE (1024 * 8) struct class *fb_class; DEFINE_MUTEX(registration_lock); struct fb_info *registered_fb[FB_MAX] __read_mostly; int num_registered_fb __read_mostly; #define for_each_registered_fb(i) \ for (i = 0; i < FB_MAX; i++) \ if (!registered_fb[i]) {} else struct fb_info *get_fb_info(unsigned int idx) { struct fb_info *fb_info; if (idx >= FB_MAX) return ERR_PTR(-ENODEV); mutex_lock(&registration_lock); fb_info = registered_fb[idx]; if (fb_info) refcount_inc(&fb_info->count); mutex_unlock(&registration_lock); return fb_info; } void put_fb_info(struct fb_info *fb_info) { if (!refcount_dec_and_test(&fb_info->count)) return; if (fb_info->fbops->fb_destroy) fb_info->fbops->fb_destroy(fb_info); } /* * Helpers */ int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix) { int depth = 0; if (fix->visual == FB_VISUAL_MONO01 || fix->visual == FB_VISUAL_MONO10) depth = 1; else { if (var->green.length == var->blue.length && var->green.length == var->red.length && var->green.offset == var->blue.offset && var->green.offset == var->red.offset) depth = var->green.length; else depth = var->green.length + var->red.length + var->blue.length; } return depth; } EXPORT_SYMBOL(fb_get_color_depth); /* * Data padding functions. */ void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) { __fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, height); } EXPORT_SYMBOL(fb_pad_aligned_buffer); void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, u32 shift_high, u32 shift_low, u32 mod) { u8 mask = (u8) (0xfff << shift_high), tmp; int i, j; for (i = height; i--; ) { for (j = 0; j < idx; j++) { tmp = dst[j]; tmp &= mask; tmp |= *src >> shift_low; dst[j] = tmp; tmp = *src << shift_high; dst[j+1] = tmp; src++; } tmp = dst[idx]; tmp &= mask; tmp |= *src >> shift_low; dst[idx] = tmp; if (shift_high < mod) { tmp = *src << shift_high; dst[idx+1] = tmp; } src++; dst += d_pitch; } } EXPORT_SYMBOL(fb_pad_unaligned_buffer); /* * we need to lock this section since fb_cursor * may use fb_imageblit() */ char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size) { u32 align = buf->buf_align - 1, offset; char *addr = buf->addr; /* If IO mapped, we need to sync before access, no sharing of * the pixmap is done */ if (buf->flags & FB_PIXMAP_IO) { if (info->fbops->fb_sync && (buf->flags & FB_PIXMAP_SYNC)) info->fbops->fb_sync(info); return addr; } /* See if we fit in the remaining pixmap space */ offset = buf->offset + align; offset &= ~align; if (offset + size > buf->size) { /* We do not fit. In order to be able to re-use the buffer, * we must ensure no asynchronous DMA'ing or whatever operation * is in progress, we sync for that. */ if (info->fbops->fb_sync && (buf->flags & FB_PIXMAP_SYNC)) info->fbops->fb_sync(info); offset = 0; } buf->offset = offset + size; addr += offset; return addr; } EXPORT_SYMBOL(fb_get_buffer_offset); int fb_pan_display(struct fb_info *info, struct fb_var_screeninfo *var) { struct fb_fix_screeninfo *fix = &info->fix; unsigned int yres = info->var.yres; int err = 0; if (var->yoffset > 0) { if (var->vmode & FB_VMODE_YWRAP) { if (!fix->ywrapstep || (var->yoffset % fix->ywrapstep)) err = -EINVAL; else yres = 0; } else if (!fix->ypanstep || (var->yoffset % fix->ypanstep)) err = -EINVAL; } if (var->xoffset > 0 && (!fix->xpanstep || (var->xoffset % fix->xpanstep))) err = -EINVAL; if (err || !info->fbops->fb_pan_display || var->yoffset > info->var.yres_virtual - yres || var->xoffset > info->var.xres_virtual - info->var.xres) return -EINVAL; if ((err = info->fbops->fb_pan_display(var, info))) return err; info->var.xoffset = var->xoffset; info->var.yoffset = var->yoffset; if (var->vmode & FB_VMODE_YWRAP) info->var.vmode |= FB_VMODE_YWRAP; else info->var.vmode &= ~FB_VMODE_YWRAP; return 0; } EXPORT_SYMBOL(fb_pan_display); static int fb_check_caps(struct fb_info *info, struct fb_var_screeninfo *var, u32 activate) { struct fb_blit_caps caps, fbcaps; int err = 0; memset(&caps, 0, sizeof(caps)); memset(&fbcaps, 0, sizeof(fbcaps)); caps.flags = (activate & FB_ACTIVATE_ALL) ? 1 : 0; fbcon_get_requirement(info, &caps); info->fbops->fb_get_caps(info, &fbcaps, var); if (!bitmap_subset(caps.x, fbcaps.x, FB_MAX_BLIT_WIDTH) || !bitmap_subset(caps.y, fbcaps.y, FB_MAX_BLIT_HEIGHT) || (fbcaps.len < caps.len)) err = -EINVAL; return err; } static void fb_lcd_notify_mode_change(struct fb_info *info, struct fb_videomode *mode) { lcd_notify_mode_change_all(info->device, mode->xres, mode->yres); } int fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var) { int ret = 0; u32 activate; struct fb_var_screeninfo old_var; struct fb_videomode mode; u32 unused; if (var->activate & FB_ACTIVATE_INV_MODE) { struct fb_videomode mode1, mode2; fb_var_to_videomode(&mode1, var); fb_var_to_videomode(&mode2, &info->var); /* make sure we don't delete the videomode of current var */ ret = fb_mode_is_equal(&mode1, &mode2); if (!ret) { ret = fbcon_mode_deleted(info, &mode1); if (!ret) fb_delete_videomode(&mode1, &info->modelist); } return ret ? -EINVAL : 0; } if (!(var->activate & FB_ACTIVATE_FORCE) && !memcmp(&info->var, var, sizeof(struct fb_var_screeninfo))) return 0; activate = var->activate; /* When using FOURCC mode, make sure the red, green, blue and * transp fields are set to 0. */ if ((info->fix.capabilities & FB_CAP_FOURCC) && var->grayscale > 1) { if (var->red.offset || var->green.offset || var->blue.offset || var->transp.offset || var->red.length || var->green.length || var->blue.length || var->transp.length || var->red.msb_right || var->green.msb_right || var->blue.msb_right || var->transp.msb_right) return -EINVAL; } if (!info->fbops->fb_check_var) { *var = info->var; return 0; } /* bitfill_aligned() assumes that it's at least 8x8 */ if (var->xres < 8 || var->yres < 8) return -EINVAL; /* Too huge resolution causes multiplication overflow. */ if (check_mul_overflow(var->xres, var->yres, &unused) || check_mul_overflow(var->xres_virtual, var->yres_virtual, &unused)) return -EINVAL; ret = info->fbops->fb_check_var(var, info); if (ret) return ret; /* verify that virtual resolution >= physical resolution */ if (var->xres_virtual < var->xres || var->yres_virtual < var->yres) { pr_warn("WARNING: fbcon: Driver '%s' missed to adjust virtual screen size (%ux%u vs. %ux%u)\n", info->fix.id, var->xres_virtual, var->yres_virtual, var->xres, var->yres); return -EINVAL; } if ((var->activate & FB_ACTIVATE_MASK) != FB_ACTIVATE_NOW) return 0; if (info->fbops->fb_get_caps) { ret = fb_check_caps(info, var, activate); if (ret) return ret; } old_var = info->var; info->var = *var; if (info->fbops->fb_set_par) { ret = info->fbops->fb_set_par(info); if (ret) { info->var = old_var; printk(KERN_WARNING "detected " "fb_set_par error, " "error code: %d\n", ret); return ret; } } fb_pan_display(info, &info->var); fb_set_cmap(&info->cmap, info); fb_var_to_videomode(&mode, &info->var); if (info->modelist.prev && info->modelist.next && !list_empty(&info->modelist)) ret = fb_add_videomode(&mode, &info->modelist); if (ret) { info->var = old_var; return ret; } fb_lcd_notify_mode_change(info, &mode); return 0; } EXPORT_SYMBOL(fb_set_var); static void fb_lcd_notify_blank(struct fb_info *info) { int power; switch (info->blank) { case FB_BLANK_UNBLANK: power = LCD_POWER_ON; break; /* deprecated; TODO: should become 'off' */ case FB_BLANK_NORMAL: power = LCD_POWER_REDUCED; break; case FB_BLANK_VSYNC_SUSPEND: power = LCD_POWER_REDUCED_VSYNC_SUSPEND; break; /* 'off' */ case FB_BLANK_HSYNC_SUSPEND: case FB_BLANK_POWERDOWN: default: power = LCD_POWER_OFF; break; } lcd_notify_blank_all(info->device, power); } static void fb_ledtrig_backlight_notify_blank(struct fb_info *info) { if (info->blank == FB_BLANK_UNBLANK) ledtrig_backlight_blank(false); else ledtrig_backlight_blank(true); } int fb_blank(struct fb_info *info, int blank) { int old_blank = info->blank; int ret; if (!info->fbops->fb_blank) return -EINVAL; if (blank > FB_BLANK_POWERDOWN) blank = FB_BLANK_POWERDOWN; info->blank = blank; ret = info->fbops->fb_blank(blank, info); if (ret) goto err; fb_bl_notify_blank(info, old_blank); fb_lcd_notify_blank(info); fb_ledtrig_backlight_notify_blank(info); return 0; err: info->blank = old_blank; return ret; } EXPORT_SYMBOL(fb_blank); static int fb_check_foreignness(struct fb_info *fi) { const bool foreign_endian = fi->flags & FBINFO_FOREIGN_ENDIAN; fi->flags &= ~FBINFO_FOREIGN_ENDIAN; #ifdef __BIG_ENDIAN fi->flags |= foreign_endian ? 0 : FBINFO_BE_MATH; #else fi->flags |= foreign_endian ? FBINFO_BE_MATH : 0; #endif /* __BIG_ENDIAN */ if (fi->flags & FBINFO_BE_MATH && !fb_be_math(fi)) { pr_err("%s: enable CONFIG_FB_BIG_ENDIAN to " "support this framebuffer\n", fi->fix.id); return -ENOSYS; } else if (!(fi->flags & FBINFO_BE_MATH) && fb_be_math(fi)) { pr_err("%s: enable CONFIG_FB_LITTLE_ENDIAN to " "support this framebuffer\n", fi->fix.id); return -ENOSYS; } return 0; } static int do_register_framebuffer(struct fb_info *fb_info) { int i, err = 0; struct fb_videomode mode; if (fb_check_foreignness(fb_info)) return -ENOSYS; if (num_registered_fb == FB_MAX) return -ENXIO; for (i = 0 ; i < FB_MAX; i++) if (!registered_fb[i]) break; if (i >= FB_MAX) return -ENXIO; if (!fb_info->modelist.prev || !fb_info->modelist.next) INIT_LIST_HEAD(&fb_info->modelist); fb_var_to_videomode(&mode, &fb_info->var); err = fb_add_videomode(&mode, &fb_info->modelist); if (err < 0) return err; fb_info->node = i; refcount_set(&fb_info->count, 1); mutex_init(&fb_info->lock); mutex_init(&fb_info->mm_lock); /* * With an fb_blank callback present, we assume that the * display is blank, so that fb_blank() enables it on the * first modeset. */ if (fb_info->fbops->fb_blank) fb_info->blank = FB_BLANK_POWERDOWN; fb_device_create(fb_info); if (fb_info->pixmap.addr == NULL) { fb_info->pixmap.addr = kmalloc(FBPIXMAPSIZE, GFP_KERNEL); if (fb_info->pixmap.addr) { fb_info->pixmap.size = FBPIXMAPSIZE; fb_info->pixmap.buf_align = 1; fb_info->pixmap.scan_align = 1; fb_info->pixmap.access_align = 32; fb_info->pixmap.flags = FB_PIXMAP_DEFAULT; } } fb_info->pixmap.offset = 0; if (bitmap_empty(fb_info->pixmap.blit_x, FB_MAX_BLIT_WIDTH)) bitmap_fill(fb_info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT)) bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); if (fb_info->skip_vt_switch) pm_vt_switch_required(fb_info->device, false); else pm_vt_switch_required(fb_info->device, true); num_registered_fb++; registered_fb[i] = fb_info; #ifdef CONFIG_GUMSTIX_AM200EPD { struct fb_event event; event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_REGISTERED, &event); } #endif return fbcon_fb_registered(fb_info); } static void unbind_console(struct fb_info *fb_info) { int i = fb_info->node; if (WARN_ON(i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)) return; fbcon_fb_unbind(fb_info); } static void unlink_framebuffer(struct fb_info *fb_info) { int i; i = fb_info->node; if (WARN_ON(i < 0 || i >= FB_MAX || registered_fb[i] != fb_info)) return; fb_device_destroy(fb_info); pm_vt_switch_unregister(fb_info->device); unbind_console(fb_info); } static void do_unregister_framebuffer(struct fb_info *fb_info) { unlink_framebuffer(fb_info); if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) { kfree(fb_info->pixmap.addr); fb_info->pixmap.addr = NULL; } fbcon_delete_modelist(&fb_info->modelist); fb_destroy_modelist(&fb_info->modelist); registered_fb[fb_info->node] = NULL; num_registered_fb--; #ifdef CONFIG_GUMSTIX_AM200EPD { struct fb_event event; event.info = fb_info; fb_notifier_call_chain(FB_EVENT_FB_UNREGISTERED, &event); } #endif fbcon_fb_unregistered(fb_info); /* this may free fb info */ put_fb_info(fb_info); } /** * register_framebuffer - registers a frame buffer device * @fb_info: frame buffer info structure * * Registers a frame buffer device @fb_info. * * Returns negative errno on error, or zero for success. * */ int register_framebuffer(struct fb_info *fb_info) { int ret; mutex_lock(&registration_lock); ret = do_register_framebuffer(fb_info); mutex_unlock(&registration_lock); return ret; } EXPORT_SYMBOL(register_framebuffer); /** * unregister_framebuffer - releases a frame buffer device * @fb_info: frame buffer info structure * * Unregisters a frame buffer device @fb_info. * * Returns negative errno on error, or zero for success. * * This function will also notify the framebuffer console * to release the driver. * * This is meant to be called within a driver's module_exit() * function. If this is called outside module_exit(), ensure * that the driver implements fb_open() and fb_release() to * check that no processes are using the device. */ void unregister_framebuffer(struct fb_info *fb_info) { mutex_lock(&registration_lock); do_unregister_framebuffer(fb_info); mutex_unlock(&registration_lock); } EXPORT_SYMBOL(unregister_framebuffer); static void devm_unregister_framebuffer(void *data) { struct fb_info *info = data; unregister_framebuffer(info); } /** * devm_register_framebuffer - resource-managed frame buffer device registration * @dev: device the framebuffer belongs to * @fb_info: frame buffer info structure * * Registers a frame buffer device @fb_info to device @dev. * * Returns negative errno on error, or zero for success. * */ int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info) { int ret; ret = register_framebuffer(fb_info); if (ret) return ret; return devm_add_action_or_reset(dev, devm_unregister_framebuffer, fb_info); } EXPORT_SYMBOL(devm_register_framebuffer); /** * fb_set_suspend - low level driver signals suspend * @info: framebuffer affected * @state: 0 = resuming, !=0 = suspending * * This is meant to be used by low level drivers to * signal suspend/resume to the core & clients. * It must be called with the console semaphore held */ void fb_set_suspend(struct fb_info *info, int state) { WARN_CONSOLE_UNLOCKED(); if (state) { fbcon_suspended(info); info->state = FBINFO_STATE_SUSPENDED; } else { info->state = FBINFO_STATE_RUNNING; fbcon_resumed(info); } } EXPORT_SYMBOL(fb_set_suspend); static int __init fbmem_init(void) { int ret; fb_class = class_create("graphics"); if (IS_ERR(fb_class)) { ret = PTR_ERR(fb_class); pr_err("Unable to create fb class; errno = %d\n", ret); goto err_fb_class; } ret = fb_init_procfs(); if (ret) goto err_class_destroy; ret = fb_register_chrdev(); if (ret) goto err_fb_cleanup_procfs; fb_console_init(); return 0; err_fb_cleanup_procfs: fb_cleanup_procfs(); err_class_destroy: class_destroy(fb_class); err_fb_class: fb_class = NULL; return ret; } #ifdef MODULE static void __exit fbmem_exit(void) { fb_console_exit(); fb_unregister_chrdev(); fb_cleanup_procfs(); class_destroy(fb_class); } module_init(fbmem_init); module_exit(fbmem_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Framebuffer base"); #else subsys_initcall(fbmem_init); #endif int fb_new_modelist(struct fb_info *info) { struct fb_var_screeninfo var = info->var; struct list_head *pos, *n; struct fb_modelist *modelist; struct fb_videomode *m, mode; int err; list_for_each_safe(pos, n, &info->modelist) { modelist = list_entry(pos, struct fb_modelist, list); m = &modelist->mode; fb_videomode_to_var(&var, m); var.activate = FB_ACTIVATE_TEST; err = fb_set_var(info, &var); fb_var_to_videomode(&mode, &var); if (err || !fb_mode_is_equal(m, &mode)) { list_del(pos); kfree(pos); } } if (list_empty(&info->modelist)) return 1; fbcon_new_modelist(info); return 0; } bool fb_modesetting_disabled(const char *drvname) { bool fwonly = video_firmware_drivers_only(); if (fwonly) pr_warn("Driver %s not loading because of nomodeset parameter\n", drvname); return fwonly; } EXPORT_SYMBOL(fb_modesetting_disabled); MODULE_LICENSE("GPL");
8 8 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor policy definitions. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #ifndef __AA_POLICY_H #define __AA_POLICY_H #include <linux/capability.h> #include <linux/cred.h> #include <linux/kref.h> #include <linux/rhashtable.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/socket.h> #include "apparmor.h" #include "audit.h" #include "capability.h" #include "domain.h" #include "file.h" #include "lib.h" #include "label.h" #include "net.h" #include "perms.h" #include "resource.h" struct aa_ns; extern int unprivileged_userns_apparmor_policy; extern int aa_unprivileged_unconfined_restricted; extern const char *const aa_profile_mode_names[]; #define APPARMOR_MODE_NAMES_MAX_INDEX 4 #define PROFILE_MODE(_profile, _mode) \ ((aa_g_profile_mode == (_mode)) || \ ((_profile)->mode == (_mode))) #define COMPLAIN_MODE(_profile) PROFILE_MODE((_profile), APPARMOR_COMPLAIN) #define USER_MODE(_profile) PROFILE_MODE((_profile), APPARMOR_USER) #define KILL_MODE(_profile) PROFILE_MODE((_profile), APPARMOR_KILL) #define PROFILE_IS_HAT(_profile) ((_profile)->label.flags & FLAG_HAT) #define CHECK_DEBUG1(_profile) ((_profile)->label.flags & FLAG_DEBUG1) #define CHECK_DEBUG2(_profile) ((_profile)->label.flags & FLAG_DEBUG2) #define profile_is_stale(_profile) (label_is_stale(&(_profile)->label)) #define on_list_rcu(X) (!list_empty(X) && (X)->prev != LIST_POISON2) /* flags in the dfa accept2 table */ enum dfa_accept_flags { ACCEPT_FLAG_OWNER = 1, }; /* * FIXME: currently need a clean way to replace and remove profiles as a * set. It should be done at the namespace level. * Either, with a set of profiles loaded at the namespace level or via * a mark and remove marked interface. */ enum profile_mode { APPARMOR_ENFORCE, /* enforce access rules */ APPARMOR_COMPLAIN, /* allow and log access violations */ APPARMOR_KILL, /* kill task on access violation */ APPARMOR_UNCONFINED, /* profile set to unconfined */ APPARMOR_USER, /* modified complain mode to userspace */ }; /* struct aa_policydb - match engine for a policy * count: refcount for the pdb * dfa: dfa pattern match * perms: table of permissions * strs: table of strings, index by x * start: set of start states for the different classes of data */ struct aa_policydb { struct kref count; struct aa_dfa *dfa; struct { struct aa_perms *perms; u32 size; }; struct aa_str_table trans; aa_state_t start[AA_CLASS_LAST + 1]; }; extern struct aa_policydb *nullpdb; struct aa_policydb *aa_alloc_pdb(gfp_t gfp); void aa_pdb_free_kref(struct kref *kref); /** * aa_get_pdb - increment refcount on @pdb * @pdb: policydb (MAYBE NULL) * * Returns: pointer to @pdb if @pdb is NULL will return NULL * Requires: @pdb must be held with valid refcount when called */ static inline struct aa_policydb *aa_get_pdb(struct aa_policydb *pdb) { if (pdb) kref_get(&(pdb->count)); return pdb; } /** * aa_put_pdb - put a pdb refcount * @pdb: pdb to put refcount (MAYBE NULL) * * Requires: if @pdb != NULL that a valid refcount be held */ static inline void aa_put_pdb(struct aa_policydb *pdb) { if (pdb) kref_put(&pdb->count, aa_pdb_free_kref); } /* lookup perm that doesn't have and object conditional */ static inline struct aa_perms *aa_lookup_perms(struct aa_policydb *policy, aa_state_t state) { unsigned int index = ACCEPT_TABLE(policy->dfa)[state]; if (!(policy->perms)) return &default_perms; return &(policy->perms[index]); } /* struct aa_data - generic data structure * key: name for retrieving this data * size: size of data in bytes * data: binary data * head: reserved for rhashtable */ struct aa_data { char *key; u32 size; char *data; struct rhash_head head; }; /* struct aa_ruleset - data covering mediation rules * @list: list the rule is on * @size: the memory consumed by this ruleset * @policy: general match rules governing policy * @file: The set of rules governing basic file access and domain transitions * @caps: capabilities for the profile * @rlimits: rlimits for the profile * @secmark_count: number of secmark entries * @secmark: secmark label match info */ struct aa_ruleset { int size; /* TODO: merge policy and file */ struct aa_policydb *policy; struct aa_policydb *file; struct aa_caps caps; struct aa_rlimit rlimits; int secmark_count; struct aa_secmark *secmark; }; /* struct aa_attachment - data and rules for a profiles attachment * @list: * @xmatch_str: human readable attachment string * @xmatch: optional extended matching for unconfined executables names * @xmatch_len: xmatch prefix len, used to determine xmatch priority * @xattr_count: number of xattrs in table * @xattrs: table of xattrs */ struct aa_attachment { const char *xmatch_str; struct aa_policydb *xmatch; unsigned int xmatch_len; int xattr_count; char **xattrs; }; /* struct aa_profile - basic confinement data * @base - base components of the profile (name, refcount, lists, lock ...) * @parent: parent of profile * @ns: namespace the profile is in * @rename: optional profile name that this profile renamed * * @audit: the auditing mode of the profile * @mode: the enforcement mode of the profile * @path_flags: flags controlling path generation behavior * @signal: the signal that should be used when kill is used * @disconnected: what to prepend if attach_disconnected is specified * @attach: attachment rules for the profile * @rules: rules to be enforced * * learning_cache: the accesses learned in complain mode * raw_data: rawdata of the loaded profile policy * hash: cryptographic hash of the profile * @dents: dentries for the profiles file entries in apparmorfs * @dirname: name of the profile dir in apparmorfs * @dents: set of dentries associated with the profile * @data: hashtable for free-form policy aa_data * @label - label this profile is an extension of * @rules - label with the rule vec on its end * * The AppArmor profile contains the basic confinement data. Each profile * has a name, and exists in a namespace. The @name and @exec_match are * used to determine profile attachment against unconfined tasks. All other * attachments are determined by profile X transition rules. * * Profiles have a hierarchy where hats and children profiles keep * a reference to their parent. * * Profile names can not begin with a : and can not contain the \0 * character. If a profile name begins with / it will be considered when * determining profile attachment on "unconfined" tasks. */ struct aa_profile { struct aa_policy base; struct aa_profile __rcu *parent; struct aa_ns *ns; const char *rename; enum audit_mode audit; long mode; u32 path_flags; int signal; const char *disconnected; struct aa_attachment attach; struct aa_loaddata *rawdata; unsigned char *hash; char *dirname; struct dentry *dents[AAFS_PROF_SIZEOF]; struct rhashtable *data; int n_rules; /* special - variable length must be last entry in profile */ struct aa_label label; }; extern enum profile_mode aa_g_profile_mode; #define AA_MAY_LOAD_POLICY AA_MAY_APPEND #define AA_MAY_REPLACE_POLICY AA_MAY_WRITE #define AA_MAY_REMOVE_POLICY AA_MAY_DELETE #define profiles_ns(P) ((P)->ns) #define name_is_shared(A, B) ((A)->hname && (A)->hname == (B)->hname) struct aa_ruleset *aa_alloc_ruleset(gfp_t gfp); struct aa_profile *aa_alloc_profile(const char *name, struct aa_proxy *proxy, gfp_t gfp); struct aa_profile *aa_alloc_null(struct aa_profile *parent, const char *name, gfp_t gfp); struct aa_profile *aa_new_learning_profile(struct aa_profile *parent, bool hat, const char *base, gfp_t gfp); void aa_free_profile(struct aa_profile *profile); struct aa_profile *aa_find_child(struct aa_profile *parent, const char *name); struct aa_profile *aa_lookupn_profile(struct aa_ns *ns, const char *hname, size_t n); struct aa_profile *aa_fqlookupn_profile(struct aa_label *base, const char *fqname, size_t n); ssize_t aa_replace_profiles(struct aa_ns *view, struct aa_label *label, u32 mask, struct aa_loaddata *udata); ssize_t aa_remove_profiles(struct aa_ns *view, struct aa_label *label, char *name, size_t size); void __aa_profile_list_release(struct list_head *head); #define profile_unconfined(X) ((X)->mode == APPARMOR_UNCONFINED) /** * aa_get_newest_profile - simple wrapper fn to wrap the label version * @p: profile (NOT NULL) * * Returns refcount to newest version of the profile (maybe @p) * * Requires: @p must be held with a valid refcount */ static inline struct aa_profile *aa_get_newest_profile(struct aa_profile *p) { return labels_profile(aa_get_newest_label(&p->label)); } static inline aa_state_t RULE_MEDIATES(struct aa_ruleset *rules, unsigned char class) { if (class <= AA_CLASS_LAST) return rules->policy->start[class]; else return aa_dfa_match_len(rules->policy->dfa, rules->policy->start[0], &class, 1); } static inline aa_state_t RULE_MEDIATES_v9NET(struct aa_ruleset *rules) { return RULE_MEDIATES(rules, AA_CLASS_NETV9); } static inline aa_state_t RULE_MEDIATES_NET(struct aa_ruleset *rules) { /* can not use RULE_MEDIATE_v9AF here, because AF match fail * can not be distiguished from class match fail, and we only * fallback to checking older class on class match failure */ aa_state_t state = RULE_MEDIATES(rules, AA_CLASS_NETV9); /* fallback and check v7/8 if v9 is NOT mediated */ if (!state) state = RULE_MEDIATES(rules, AA_CLASS_NET); return state; } void aa_compute_profile_mediates(struct aa_profile *profile); static inline bool profile_mediates(struct aa_profile *profile, unsigned char class) { return label_mediates(&profile->label, class); } static inline bool profile_mediates_safe(struct aa_profile *profile, unsigned char class) { return label_mediates_safe(&profile->label, class); } /** * aa_get_profile - increment refcount on profile @p * @p: profile (MAYBE NULL) * * Returns: pointer to @p if @p is NULL will return NULL * Requires: @p must be held with valid refcount when called */ static inline struct aa_profile *aa_get_profile(struct aa_profile *p) { if (p) kref_get(&(p->label.count)); return p; } /** * aa_get_profile_not0 - increment refcount on profile @p found via lookup * @p: profile (MAYBE NULL) * * Returns: pointer to @p if @p is NULL will return NULL * Requires: @p must be held with valid refcount when called */ static inline struct aa_profile *aa_get_profile_not0(struct aa_profile *p) { if (p && kref_get_unless_zero(&p->label.count)) return p; return NULL; } /** * aa_get_profile_rcu - increment a refcount profile that can be replaced * @p: pointer to profile that can be replaced (NOT NULL) * * Returns: pointer to a refcounted profile. * else NULL if no profile */ static inline struct aa_profile *aa_get_profile_rcu(struct aa_profile __rcu **p) { struct aa_profile *c; rcu_read_lock(); do { c = rcu_dereference(*p); } while (c && !kref_get_unless_zero(&c->label.count)); rcu_read_unlock(); return c; } /** * aa_put_profile - decrement refcount on profile @p * @p: profile (MAYBE NULL) */ static inline void aa_put_profile(struct aa_profile *p) { if (p) kref_put(&p->label.count, aa_label_kref); } static inline int AUDIT_MODE(struct aa_profile *profile) { if (aa_g_audit != AUDIT_NORMAL) return aa_g_audit; return profile->audit; } bool aa_policy_view_capable(const struct cred *subj_cred, struct aa_label *label, struct aa_ns *ns); bool aa_policy_admin_capable(const struct cred *subj_cred, struct aa_label *label, struct aa_ns *ns); int aa_may_manage_policy(const struct cred *subj_cred, struct aa_label *label, struct aa_ns *ns, u32 mask); bool aa_current_policy_view_capable(struct aa_ns *ns); bool aa_current_policy_admin_capable(struct aa_ns *ns); #endif /* __AA_POLICY_H */
8 8 7 7 7 6 6 6 5 6 4 13 12 11 11 13 8 8 6 8 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 // SPDX-License-Identifier: GPL-2.0 /* * Implement mseal() syscall. * * Copyright (c) 2023,2024 Google, Inc. * * Author: Jeff Xu <jeffxu@chromium.org> */ #include <linux/mempolicy.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/syscalls.h> #include <linux/sched.h> #include "internal.h" /* * mseal() disallows an input range which contain unmapped ranges (VMA holes). * * It disallows unmapped regions from start to end whether they exist at the * start, in the middle, or at the end of the range, or any combination thereof. * * This is because after sealng a range, there's nothing to stop memory mapping * of ranges in the remaining gaps later, meaning that the user might then * wrongly consider the entirety of the mseal()'d range to be sealed when it * in fact isn't. */ /* * Does the [start, end) range contain any unmapped memory? * * We ensure that: * - start is part of a valid VMA. * - end is part of a valid VMA. * - no gap (unallocated memory) exists between start and end. */ static bool range_contains_unmapped(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *vma; unsigned long prev_end = start; VMA_ITERATOR(vmi, current->mm, start); for_each_vma_range(vmi, vma, end) { if (vma->vm_start > prev_end) return true; prev_end = vma->vm_end; } return prev_end < end; } static int mseal_apply(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *vma, *prev; unsigned long curr_start = start; VMA_ITERATOR(vmi, mm, start); /* We know there are no gaps so this will be non-NULL. */ vma = vma_iter_load(&vmi); prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; for_each_vma_range(vmi, vma, end) { unsigned long curr_end = MIN(vma->vm_end, end); if (!(vma->vm_flags & VM_SEALED)) { vma = vma_modify_flags(&vmi, prev, vma, curr_start, curr_end, vma->vm_flags | VM_SEALED); if (IS_ERR(vma)) return PTR_ERR(vma); vm_flags_set(vma, VM_SEALED); } prev = vma; curr_start = curr_end; } return 0; } /* * mseal(2) seals the VM's meta data from * selected syscalls. * * addr/len: VM address range. * * The address range by addr/len must meet: * start (addr) must be in a valid VMA. * end (addr + len) must be in a valid VMA. * no gap (unallocated memory) between start and end. * start (addr) must be page aligned. * * len: len will be page aligned implicitly. * * Below VMA operations are blocked after sealing. * 1> Unmapping, moving to another location, and shrinking * the size, via munmap() and mremap(), can leave an empty * space, therefore can be replaced with a VMA with a new * set of attributes. * 2> Moving or expanding a different vma into the current location, * via mremap(). * 3> Modifying a VMA via mmap(MAP_FIXED). * 4> Size expansion, via mremap(), does not appear to pose any * specific risks to sealed VMAs. It is included anyway because * the use case is unclear. In any case, users can rely on * merging to expand a sealed VMA. * 5> mprotect and pkey_mprotect. * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED) * for anonymous memory, when users don't have write permission to the * memory. Those behaviors can alter region contents by discarding pages, * effectively a memset(0) for anonymous memory. * * flags: reserved. * * return values: * zero: success. * -EINVAL: * invalid input flags. * start address is not page aligned. * Address arange (start + len) overflow. * -ENOMEM: * addr is not a valid address (not allocated). * end (start + len) is not a valid address. * a gap (unallocated memory) between start and end. * -EPERM: * - In 32 bit architecture, sealing is not supported. * Note: * user can call mseal(2) multiple times, adding a seal on an * already sealed memory is a no-action (no error). * * unseal() is not supported. */ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { size_t len; int ret = 0; unsigned long end; struct mm_struct *mm = current->mm; /* Verify flags not set. */ if (flags) return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero. */ if (len_in && !len) return -EINVAL; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; if (mmap_write_lock_killable(mm)) return -EINTR; if (range_contains_unmapped(mm, start, end)) { ret = -ENOMEM; goto out; } /* * Second pass, this should success, unless there are errors * from vma_modify_flags, e.g. merge/split error, or process * reaching the max supported VMAs, however, those cases shall * be rare. */ ret = mseal_apply(mm, start, end); out: mmap_write_unlock(mm); return ret; } SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long, flags) { return do_mseal(start, len, flags); }
48 35 66 51 26 17 26 31 31 24 1188 145 31 2330 2362 39 77 170 170 2 2 2 7081 1134 1134 1128 170 171 108 74 170 169 171 2 2 3279 30 93 93 93 3164 3160 3165 3160 3177 30 3164 3163 1257 1253 27 1255 1032 1155 145 133 1032 70 70 68 2338 4 1235 1238 1 1155 1 975 68 70 8 2 2 65 85 118 170 1 1 1 6 6 5 6 6 6 5 6 6 6 3020 3013 3014 3012 1 1 510 2341 2332 2341 512 512 511 505 510 512 3 3 3 3 39 38 39 39 39 79 79 38 38 77 77 77 74 75 75 79 80 64 78 73 104 75 75 31 29 25 29 29 29 29 29 11 11 26 63 93 92 32 93 93 32 36 104 103 90 90 90 104 104 42 104 104 50 111 39 39 63 63 62 63 63 170 170 170 108 7 107 108 30 108 34 101 64 101 101 19 31 31 30 31 36 73 35 35 1 35 35 1 35 35 36 85 84 2 85 85 53 5071 5063 5060 5001 5027 1132 1122 85 1122 83 85 85 85 84 53 53 31 5662 5048 39 5051 5754 5775 5753 39 37 38 25 29 32 17 16 7 7 7 7 7 7 6 7 7 7 7 140 139 140 139 140 25 26 137 140 31 31 31 31 50 28 22 21 21 31 8 31 31 31 31 31 31 31 8 31 31 31 31 31 31 31 31 31 4 28 26 31 31 4 4 4 4 4 4 16 16 3 2 1 1 13 13 4 4 9 9 9 9 5 4 4 3 9 1 1 1 1 1 1 1 1 291 12 19 17 17 16 19 19 17 19 22 22 18 24 24 30 11 32 32 32 31 31 29 29 29 29 29 29 29 29 29 29 29 1 1 31 5 5 5 5 5 5 3 1 4 4 4 2 2 2 2 63 62 62 62 63 63 63 63 62 63 63 63 63 63 63 62 63 63 63 63 63 63 62 63 63 63 63 1 1 1 1 48 8 8 43 5 9 49 49 47 9 9 48 49 49 49 45 1 45 20 43 9 9 3 76 77 76 13 77 77 63 64 64 64 11 10 4 5 4 5 4 4 4 2 4 4 7 3 3 3 7 7 7 4 2 4 1 1 7 7 6 6 7 3 6 6 6 7 6 6 5 14 11 11 11 11 3 11 3 8 11 2 7 2 10 1 6 6 1 5 14 14 14 14 14 7 1 2 7 10 47 43 19 19 8 8 7 7 7 7 7 6 6 5 5 5 4 4 3 3 3 4 5 1 1 7 7 7 6 5 3 3 3 3 3 2 34 34 34 34 34 5 1 1 1 1 9 5 5 5 5 5 5 5 1 1 4 4 4 2 1 71 18 66 65 4 64 64 72 54 54 15 38 37 37 38 87 87 87 87 87 86 13 83 54 54 87 34 34 34 34 34 34 32 34 34 34 4 4 1 4 4 4 120 42 42 4 121 3 122 115 1 114 39 115 115 116 105 103 12 103 38 104 42 104 32 104 10 104 9 102 2 104 53 103 14 104 5 104 8 98 6 92 1 91 6 88 2 87 117 114 45 1 26 18 26 26 45 45 1 44 30 30 44 44 45 29 29 507 509 478 31 30 508 2 29 29 28 29 29 28 29 28 29 29 28 29 123 1 122 1 119 3 118 121 122 121 22 16 22 11 21 10 22 3 22 7 22 14 14 13 14 12 11 1 8 10 9 8 8 7 7 7 7 7 7 7 7 7 6 1 7 8 16 11 15 7 9 31 31 31 24 14 29 29 27 2 28 27 17 4 24 6 5 18 5 19 2 22 22 14 1 20 7 6 14 3 14 14 27 18 38 7 7 12 12 13 12 2 10 9 2 8 8 2 8 7 7 7 7 7 5 4 3 3 3 3 3 3 3 3 3 3 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 1 2 2 3 2 2 1 2 2 2 2 2 1 3 9 5 1 4 3 2 22 13 12 12 11 6 5 1 1 5 9 4 4 22 20 19 18 16 13 21 20 19 6 19 1 20 1 20 16 4 3 4 21 6 4 3 4 2 2 1 2 1 2 2 3 4 9 3 25 23 21 14 14 14 13 14 13 8 12 14 14 14 14 14 14 14 10 6 6 6 6 6 2 6 6 6 6 6 3 2 2 1 1 1 2 2 2 2 2 2 10 10 10 10 6 6 6 6 6 6 2 2 2 2 6 2 2 2 2 2 2 10 10 10 6 6 6 10 10 6 6 10 10 10 22 27 30 30 8 21 30 22 22 21 22 22 15 17 15 14 15 9 15 15 14 15 14 15 10 15 10 11 6 11 6 11 6 11 2 11 2 11 6 11 2 11 2 11 2 11 6 11 2 11 18 16 11 11 10 52 46 47 46 45 52 41 11 31 10 10 25 19 18 25 23 3 20 19 16 18 18 17 17 11 15 17 4 11 15 15 15 7 8 6 13 2 13 13 3 1 3 10 7 3 13 10 10 7 32 24 23 22 16 33 33 31 30 29 24 8 8 15 15 7 94 6 6 66 6 6 6 6 45 44 45 4 4 4 4 6 12 11 11 11 11 10 10 10 12 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/pidfs.h> #include <linux/nstree.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __ro_after_init; static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { if (!str) return 0; mhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { if (!str) return 0; mphash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mphash_entries=", set_mphash_entries); static char * __initdata initramfs_options; static int __init initramfs_options_setup(char *str) { initramfs_options = str; return 1; } __setup("initramfs_options=", initramfs_options_setup); static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ static inline void namespace_lock(void); static void namespace_unlock(void); DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), up_read(&namespace_sem)) DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), }; struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { struct ns_common *ns; if (!node) return NULL; ns = rb_entry(node, struct ns_common, ns_tree_node); return container_of(ns, struct mnt_namespace, ns); } static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (ns && refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (ns_tree_active(ns)) ns_tree_remove(ns); call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* * Lookup a mount namespace by id and take a passive reference count. Taking a * passive reference means the mount namespace can be emptied if e.g., the last * task holding an active reference exits. To access the mounts of the * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. * * Note the lookup is lockless protected by a sequence counter. We only * need to guard against false negatives as false positives aren't * possible. So if we didn't find a mount namespace and the sequence * counter has changed we need to retry. If the sequence counter is * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { struct mnt_namespace *mnt_ns; struct ns_common *ns; guard(rcu)(); ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. */ mnt_ns = container_of(ns, struct mnt_namespace, ns); refcount_inc(&mnt_ns->passive); return mnt_ns; } static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> m_hash_shift); return &mount_hashtable[tmp & m_hash_mask]; } static inline struct hlist_head *mp_hash(struct dentry *dentry) { unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> mp_hash_shift); return &mountpoint_hashtable[tmp & mp_hash_mask]; } static int mnt_alloc_id(struct mount *mnt) { int res; xa_lock(&mnt_id_xa); res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); if (!res) mnt->mnt_id_unique = ++mnt_id_ctr; xa_unlock(&mnt_id_xa); return res; } static void mnt_free_id(struct mount *mnt) { xa_erase(&mnt_id_xa, mnt->mnt_id); } /* * Allocate a new peer group ID */ static int mnt_alloc_group_id(struct mount *mnt) { int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); if (res < 0) return res; mnt->mnt_group_id = res; return 0; } /* * Release a peer group ID */ void mnt_release_group_id(struct mount *mnt) { ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } /* * vfsmount lock must be held for read */ static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else preempt_disable(); mnt->mnt_count += n; preempt_enable(); #endif } /* * vfsmount lock must be held for write */ int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; } return count; #else return mnt->mnt_count; #endif } static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; err = mnt_alloc_id(mnt); if (err) goto out_free_cache; if (name) mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); else mnt->mnt_devname = "none"; if (!mnt->mnt_devname) goto out_free_id; #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); if (!mnt->mnt_pcp) goto out_free_devname; this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); INIT_HLIST_HEAD(&mnt->mnt_slave_list); INIT_HLIST_NODE(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; #ifdef CONFIG_SMP out_free_devname: kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); return NULL; } /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). * We must keep track of when those operations start * (for permission checks) and when they end, so that * we can determine when writes are able to occur to * a filesystem. */ /* * __mnt_is_readonly: check whether a mount is read-only * @mnt: the mount to check for its write status * * This shouldn't be used directly ouside of the VFS. * It does not guarantee that the filesystem will stay * r/w, just that it is right *now*. This can not and * should not be used in place of IS_RDONLY(inode). * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(__mnt_is_readonly); static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; #else return mnt->mnt_writers; #endif } static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; /* * The barrier pairs with the barrier in sb_start_ro_state_change() * making sure if we don't see s_readonly_remount set yet, we also will * not see any superblock / mount flag changes done by remount. * It also pairs with the barrier in sb_end_ro_state_change() * assuring that if we see s_readonly_remount already cleared, we will * see the values of superblock / mount flags updated by remount. */ smp_rmb(); return __mnt_is_readonly(mnt); } /* * Most r/o & frozen checks on a fs are for operations that take discrete * amounts of time, like a write() or unlink(). We must keep track of when * those operations start (for permission checks) and when they end, so that we * can determine when writes are able to occur to a filesystem. */ /** * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); read_seqlock_excl(&mount_lock); read_sequnlock_excl(&mount_lock); preempt_disable(); } } /* * The barrier pairs with the barrier sb_start_ro_state_change() making * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. */ smp_rmb(); if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; } preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mount is read-write, filesystem * is not frozen) before returning success. When the write operation is * finished, mnt_drop_write() must be called. This is effectively a refcount. */ int mnt_want_write(struct vfsmount *m) { int ret; sb_start_write(m->mnt_sb); ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); /** * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be * paired with mnt_put_write_access_file. */ int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* * Superblock may have become readonly while there are still * writable fd's, e.g. due to a fs error with errors=remount-ro */ if (__mnt_is_readonly(file->f_path.mnt)) return -EROFS; return 0; } return mnt_get_write_access(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but if the file is already open for writing it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the freeze protection and the check for emergency r/o * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { int ret; sb_start_write(file_inode(file)->i_sb); ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with * mnt_get_write_access() call above. */ void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done performing writes to it and * also allows filesystem to be frozen again. Must be matched with * mnt_want_write() call above. */ void mnt_drop_write(struct vfsmount *mnt) { mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); /** * mnt_hold_writers - prevent write access to the given mount * @mnt: mnt to prevent write access to * * Prevents write access to @mnt if there are no active writers for @mnt. * This function needs to be called and return successfully before changing * properties of @mnt that need to remain stable for callers with write access * to @mnt. * * After this functions has been called successfully callers must pair it with * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * * Context: This function expects to be in mount_locked_reader scope serializing * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { set_write_hold(mnt); /* * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); /* * With writers on hold, if this value is zero, then there are * definitely no active writers (although held writers may subsequently * increment the count, they'll have to wait, and decrement it after * seeing MNT_READONLY). * * It is OK to have counter incremented on one CPU and decremented on * another: the sum will add up correctly. The danger would be when we * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) return -EBUSY; return 0; } /** * mnt_unhold_writers - stop preventing write access to the given mount * @mnt: mnt to stop preventing write access to * * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * * This function can only be called after a call to mnt_hold_writers(). * * Context: This function expects to be in the same mount_locked_reader scope * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { if (!test_write_hold(mnt)) return; /* * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) { struct mount **p = m->mnt_pprev_for_sb; struct mount *next = m->mnt_next_for_sb; if (next) next->mnt_pprev_for_sb = p; *p = next; } static inline void mnt_add_instance(struct mount *m, struct super_block *s) { struct mount *first = s->s_mounts; if (first) first->mnt_pprev_for_sb = &m->mnt_next_for_sb; m->mnt_next_for_sb = first; m->mnt_pprev_for_sb = &s->s_mounts; s->s_mounts = m; } static int mnt_make_readonly(struct mount *mnt) { int ret; ret = mnt_hold_writers(mnt); if (!ret) mnt->mnt.mnt_flags |= MNT_READONLY; mnt_unhold_writers(mnt); return ret; } int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; guard(mount_locked_reader)(); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (!(m->mnt.mnt_flags & MNT_READONLY)) { err = mnt_hold_writers(m); if (err) break; } } if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { if (test_write_hold(m)) clear_write_hold(m); } return err; } static void free_vfsmnt(struct mount *mnt) { mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } static void delayed_free_vfsmnt(struct rcu_head *head) { free_vfsmnt(container_of(head, struct mount, mnt_rcu)); } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) return 1; if (bastard == NULL) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; lock_mount_hash(); if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; } unlock_mount_hash(); /* caller will mntput() */ return -1; } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) return true; if (unlikely(res < 0)) { rcu_read_unlock(); mntput(bastard); rcu_read_lock(); } return false; } /** * __lookup_mnt - mount hash lookup * @mnt: parent mount * @dentry: dentry of mountpoint * * If @mnt has a child mount @c mounted on @dentry find and return it. * Caller must either hold the spinlock component of @mount_lock or * hold rcu_read_lock(), sample the seqcount component before the call * and recheck it afterwards. * * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; } /** * lookup_mnt - Return the child mount mounted at given location * @path: location in the namespace * * Acquires and returns a new reference to mount at given location * or %NULL if nothing is mounted there. */ struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; unsigned seq; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry); m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq)); rcu_read_unlock(); return m; } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) if (mnt->mnt_mountpoint == dentry) return true; return false; } struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { hlist_add_head(&m->node, &mp->m_list); m->mp = mp; return true; } } return false; } static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) { struct mountpoint *mp __free(kfree) = NULL; bool found; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) return -ENOENT; mountpoint: read_seqlock_excl(&mount_lock); found = lookup_mountpoint(dentry, m); read_sequnlock_excl(&mount_lock); if (found) return 0; } if (!mp) mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!mp) return -ENOMEM; /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); /* Someone else set d_mounted? */ if (ret == -EBUSY) goto mountpoint; /* The dentry is not available as a mountpoint? */ if (ret) return ret; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); mp->m_dentry = dget(dentry); hlist_add_head(&mp->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&mp->m_list); hlist_add_head(&m->node, &mp->m_list); m->mp = no_free_ptr(mp); read_sequnlock_excl(&mount_lock); return 0; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list) { if (hlist_empty(&mp->m_list)) { struct dentry *dentry = mp->m_dentry; spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } /* * locks: mount_lock [read_seqlock_excl], namespace_sem [excl] */ static void unpin_mountpoint(struct pinned_mountpoint *m) { if (m->mp) { hlist_del(&m->node); maybe_free_mountpoint(m->mp, &ex_mountpoints); } } static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } static inline bool check_anonymous_mnt(struct mount *mnt) { u64 seq; if (!is_anon_ns(mnt->mnt_ns)) return false; seq = mnt->mnt_ns->seq_origin; return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; wake_up_interruptible(&ns->poll); } } /* * locks: mount_lock[write_seqlock] */ static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list) { struct mountpoint *mp; struct mount *parent = mnt->mnt_parent; if (unlikely(parent->overmount == mnt)) parent->overmount = NULL; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; maybe_free_mountpoint(mp, shrink_list); } /* * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints) */ static void umount_mnt(struct mount *mnt) { __umount_mnt(mnt, &ex_mountpoints); } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } static void make_visible(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root)) parent->overmount = mnt; hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /** * attach_mnt - mount a mount, attach to @mount_hashtable and parent's * list of child mounts * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint * * Mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * * Note, when make_visible() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); make_visible(mnt); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); attach_mnt(mnt, parent, mp); maybe_free_mountpoint(old_mp, &ex_mountpoints); } static inline struct mount *node_to_mount(struct rb_node *node) { return node ? rb_entry(node, struct mount, mnt_node) : NULL; } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; mnt_last_node = false; } else { link = &parent->rb_right; mnt_first_node = false; } } if (mnt_last_node) ns->mnt_last_node = &mnt->mnt_node; if (mnt_first_node) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); mnt_notify_add(mnt); } static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { while (1) { if (p == root) return NULL; next = p->mnt_child.next; if (next != &p->mnt_parent->mnt_mounts) break; p = p->mnt_parent; } } return list_entry(next, struct mount, mnt_child); } static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { struct mnt_namespace *n = mnt->mnt_parent->mnt_ns; if (!mnt_ns_attached(mnt)) { for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) mnt_add_to_ns(n, m); n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; } make_visible(mnt); touch_mnt_namespace(n); } static void setup_mnt(struct mount *m, struct dentry *root) { struct super_block *s = root->d_sb; atomic_inc(&s->s_active); m->mnt.mnt_sb = s; m->mnt.mnt_root = dget(root); m->mnt_mountpoint = m->mnt.mnt_root; m->mnt_parent = m; guard(mount_locked_reader)(); mnt_add_instance(m, s); } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. */ struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; if (!fc->root) return ERR_PTR(-EINVAL); mnt = alloc_vfsmnt(fc->source); if (!mnt) return ERR_PTR(-ENOMEM); if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; setup_mnt(mnt, fc->root); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); struct vfsmount *fc_mount(struct fs_context *fc) { int err = vfs_get_tree(fc); if (!err) { up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc); } return ERR_PTR(err); } EXPORT_SYMBOL(fc_mount); struct vfsmount *fc_mount_longterm(struct fs_context *fc) { struct vfsmount *mnt = fc_mount(fc); if (!IS_ERR(mnt)) real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; return mnt; } EXPORT_SYMBOL(fc_mount_longterm); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct fs_context *fc; struct vfsmount *mnt; int ret = 0; if (!type) return ERR_PTR(-EINVAL); fc = fs_context_for_mount(type, flags); if (IS_ERR(fc)) return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct mount *mnt; int err; mnt = alloc_vfsmnt(old->mnt_devname); if (!mnt) return ERR_PTR(-ENOMEM); mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & ~MNT_INTERNAL_FLAGS; if (flag & (CL_SLAVE | CL_PRIVATE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { err = mnt_alloc_group_id(mnt); if (err) goto out_free; } if (mnt->mnt_group_id) set_mnt_shared(mnt); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; if (peers(mnt, old)) list_add(&mnt->mnt_share, &old->mnt_share); if ((flag & CL_SLAVE) && old->mnt_group_id) { hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; } else if (IS_MNT_SLAVE(old)) { hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; } return mnt; out_free: mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } static void cleanup_mnt(struct mount *mnt) { struct hlist_node *p; struct mount *m; /* * The warning here probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this happens, the * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } static void __cleanup_mnt(struct rcu_head *head) { cleanup_mnt(container_of(head, struct mount, mnt_rcu)); } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); struct mount *m, *t; llist_for_each_entry_safe(m, t, node, mnt_llist) cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* * Since we don't do lock_mount_hash() here, * ->mnt_ns can change under us. However, if it's * non-NULL, then there's a reference that won't * be dropped until after an RCU delay done after * turning ->mnt_ns NULL. So if we observe it * non-NULL under rcu_read_lock(), the reference * we are dropping is not the final one. */ mnt_add_count(mnt, -1); rcu_read_unlock(); return; } lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab * mount_lock, we'll see their refcount increment here. */ smp_mb(); mnt_add_count(mnt, -1); count = mnt_get_count(mnt); if (count != 0) { WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); return; } mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __umount_mnt(p, &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) schedule_delayed_work(&delayed_mntput_work, 1); return; } cleanup_mnt(mnt); } void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); /* * Make a mount point inaccessible to new lookups. * Because there may still be current users, the caller MUST WAIT * for an RCU grace period before destroying the mount point. */ void mnt_make_shortterm(struct vfsmount *mnt) { if (mnt) real_mount(mnt)->mnt_ns = NULL; } /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { unsigned seq; bool res; if (!d_mountpoint(path->dentry)) return false; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); res = __path_is_mountpoint(path); } while (read_seqretry(&mount_lock, seq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL(path_is_mountpoint); struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); if (IS_ERR(p)) return ERR_CAST(p); p->mnt.mnt_flags |= MNT_INTERNAL; return &p->mnt; } /* * Returns the mount which either has the specified mnt_id, or has the next * smallest id afer the specified one. */ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id <= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_left; } else { node = node->rb_right; } } return ret; } /* * Returns the mount which either has the specified mnt_id, or has the next * greater id before the specified one. */ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id >= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_right; } else { node = node->rb_left; } } return ret; } #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; down_read(&namespace_sem); return mnt_find_id_at(p->ns, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct mount *next = NULL, *mnt = v; struct rb_node *node = rb_next(&mnt->mnt_node); ++*pos; if (node) { next = node_to_mount(node); *pos = next->mnt_id_unique; } return next; } static void m_stop(struct seq_file *m, void *v) { up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *r = v; return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show, }; #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); bool busy = false; /* write lock needed for mnt_get_count */ lock_mount_hash(); for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { busy = true; break; } } unlock_mount_hash(); return !busy; } EXPORT_SYMBOL(may_umount_tree); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. */ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); #ifdef CONFIG_FSNOTIFY static void mnt_notify(struct mount *p) { if (!p->prev_ns && p->mnt_ns) { fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } else if (p->prev_ns && !p->mnt_ns) { fsnotify_mnt_detach(p->prev_ns, &p->mnt); } else if (p->prev_ns == p->mnt_ns) { fsnotify_mnt_move(p->mnt_ns, &p->mnt); } else { fsnotify_mnt_detach(p->prev_ns, &p->mnt); fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } p->prev_ns = p->mnt_ns; } static void notify_mnt_list(void) { struct mount *m, *tmp; /* * Notify about mounts that were added/reparented/detached/remain * connected after unmount. */ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) { mnt_notify(m); list_del_init(&m->to_notify); } } static bool need_notify_mnt_list(void) { return !list_empty(&notify_list); } #else static void notify_mnt_list(void) { } static bool need_notify_mnt_list(void) { return false; } #endif static void free_mnt_ns(struct mnt_namespace *); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; struct mnt_namespace *ns = emptied_ns; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); emptied_ns = NULL; if (need_notify_mnt_list()) { /* * No point blocking out concurrent readers while notifications * are sent. This will also allow statmount()/listmount() to run * concurrently. */ downgrade_write(&namespace_sem); notify_mnt_list(); up_read(&namespace_sem); } else { up_write(&namespace_sem); } if (unlikely(ns)) { /* Make sure we notice when we leak mounts. */ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); } shrink_dentry_list(&list); if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } } static inline void namespace_lock(void) { down_write(&namespace_sem); } enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { /* Leaving mounts connected is only valid for lazy umounts */ if (how & UMOUNT_SYNC) return true; /* A mount without a parent has nothing to be connected to */ if (!mnt_has_parent(mnt)) return true; /* Because the reference counting rules change when mounts are * unmounted and connected, umounted mounts may not be * connected to mounted mounts. */ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) return true; /* Has it been requested that the mount remain connected? */ if (how & UMOUNT_CONNECTED) return false; /* Is the mount locked such that it needs to remain connected? */ if (IS_MNT_LOCKED(mnt)) return false; /* By default disconnect the mount */ return true; } /* * mount_lock must be held * namespace_sem must be held for write */ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { LIST_HEAD(tmp_list); struct mount *p; if (how & UMOUNT_PROPAGATE) propagate_mount_unlock(mnt); /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) move_from_ns(p); list_add_tail(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); } /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); bulk_make_private(&tmp_list); while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); } } if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); /* * At this point p->mnt_ns is NULL, notification will be queued * only if * * - p->prev_ns is non-NULL *and* * - p->prev_ns->n_fsnotify_marks is non-NULL * * This will preclude queuing the mount if this is a cleanup * after a failed copy_tree() or destruction of an anonymous * namespace, etc. */ mnt_notify_add(p); } } static void shrink_submounts(struct mount *mnt); static int do_umount_root(struct super_block *sb) { int ret = 0; down_write(&sb->s_umount); if (!sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY); if (IS_ERR(fc)) { ret = PTR_ERR(fc); } else { ret = parse_monolithic_mount_data(fc, NULL); if (!ret) ret = reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); return ret; } static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; /* * Allow userspace to request a mountpoint be expired rather than * unmounting unconditionally. Unmount only happens if: * (1) the mark is already set (the mark is cleared by mntput()) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; /* * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ lock_mount_hash(); if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; } /* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { sb->s_op->umount_begin(sb); } /* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return do_umount_root(sb); } namespace_lock(); lock_mount_hash(); /* Repeat the earlier racy checks, now that we are holding the locks */ retval = -EINVAL; if (!check_mnt(mnt)) goto out; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; if (!mnt_has_parent(mnt)) /* not the absolute root */ goto out; event++; if (flags & MNT_DETACH) { umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } out: unlock_mount_hash(); namespace_unlock(); return retval; } /* * __detach_mounts - lazily unmount all mounts on the specified dentry * * During unlink, rmdir, and d_drop it is possible to loose the path * to an existing mountpoint, and wind up leaking the mount. * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * * The caller may hold dentry->d_inode->i_rwsem. */ void __detach_mounts(struct dentry *dentry) { struct pinned_mountpoint mp = {}; struct mount *mnt; guard(namespace_excl)(); guard(mount_writer)(); if (!lookup_mountpoint(dentry, &mp)) return; event++; while (mp.node.next) { mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); } /* * Is the caller allowed to modify his namespace? */ bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } static void warn_mandlock(void) { pr_warn_once("=======================================================\n" "WARNING: The mand mount option has been deprecated and\n" " and is ignored by this kernel. Remove the mand\n" " option from the mount to silence this warning.\n" "=======================================================\n"); } static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } // caller is responsible for flags being sane int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; ret = can_umount(path, flags); if (!ret) ret = do_umount(mnt, flags); /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path->dentry); mntput_no_expire(mnt); return ret; } static int ksys_umount(char __user *name, int flags) { int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; int ret; // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (ret) return ret; return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { return ksys_umount(name, flags); } #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* * The 2.0 compatible umount. No flags. */ SYSCALL_DEFINE1(oldumount, char __user *, name) { return ksys_umount(name, 0); } #endif static bool is_mnt_ns_file(struct dentry *dentry) { struct ns_common *ns; /* Is this a proxy for a mount namespace? */ if (dentry->d_op != &ns_dentry_operations) return false; ns = d_inode(dentry)->i_private; return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { struct ns_common *ns; guard(rcu)(); for (;;) { ns = ns_tree_adjoined_rcu(mntns, previous); if (IS_ERR(ns)) return ERR_CAST(ns); mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU * delay so accessing the mount namespace is not just * safe but all relevant members are still valid. */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* * We need an active reference count as we're persisting * the mount namespace and it might already be on its * deathbed. */ if (!ns_ref_get(mntns)) continue; return mntns; } } struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) { if (!is_mnt_ns_file(dentry)) return NULL; return to_mnt_ns(get_proc_ns(dentry->d_inode)); } static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); if (!mnt_ns) return false; return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { struct mount *res, *src_parent, *src_root_child, *src_mnt, *dst_parent, *dst_mnt; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = dst_mnt = clone_mnt(src_root, dentry, flag); if (IS_ERR(dst_mnt)) return dst_mnt; src_parent = src_root; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) continue; for (src_mnt = src_root_child; src_mnt; src_mnt = next_mnt(src_mnt, src_root_child)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_mnt)) { if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ dst_mnt = ERR_PTR(-EPERM); goto out; } else { src_mnt = skip_mnt_tree(src_mnt); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(src_mnt->mnt.mnt_root)) { src_mnt = skip_mnt_tree(src_mnt); continue; } while (src_parent != src_mnt->mnt_parent) { src_parent = src_parent->mnt_parent; dst_mnt = dst_mnt->mnt_parent; } src_parent = src_mnt; dst_parent = dst_mnt; dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); if (src_mnt->mnt.mnt_flags & MNT_LOCKED) dst_mnt->mnt.mnt_flags |= MNT_LOCKED; if (unlikely(flag & CL_EXPIRE)) { /* stick the duplicate mount on the same expiry * list as the original if that was on one */ if (!list_empty(&src_mnt->mnt_expire)) list_add(&dst_mnt->mnt_expire, &src_mnt->mnt_expire); } attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); unlock_mount_hash(); } } return res; out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return dst_mnt; } static inline bool extend_array(struct path **res, struct path **to_free, unsigned n, unsigned *count, unsigned new_count) { struct path *p; if (likely(n < *count)) return true; p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); if (p && *count) memcpy(p, *res, *count * sizeof(struct path)); *count = new_count; kfree(*to_free); *to_free = *res = p; return p; } const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); struct mount *child; struct path *res = prealloc, *to_free = NULL; unsigned n = 0; guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); if (!extend_array(&res, &to_free, 0, &count, 32)) return ERR_PTR(-ENOMEM); res[n++] = *path; list_for_each_entry(child, &root->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, path->dentry)) continue; for (struct mount *m = child; m; m = next_mnt(m, child)) { if (!extend_array(&res, &to_free, n, &count, 2 * count)) return ERR_PTR(-ENOMEM); res[n].mnt = &m->mnt; res[n].dentry = m->mnt.mnt_root; n++; } } if (!extend_array(&res, &to_free, n, &count, count + 1)) return ERR_PTR(-ENOMEM); memset(res + n, 0, (count - n) * sizeof(struct path)); for (struct path *p = res; p->mnt; p++) path_get(p); return res; } void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); } static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); void dissolve_on_fput(struct vfsmount *mnt) { struct mount *m = real_mount(mnt); /* * m used to be the root of anon namespace; if it still is one, * we need to dissolve the mount tree and free that namespace. * Let's try to avoid taking namespace_sem if we can determine * that there's nothing to do without it - rcu_read_lock() is * enough to make anon_ns_root() memory-safe and once m has * left its namespace, it's no longer our concern, since it will * never become a root of anon ns again. */ scoped_guard(rcu) { if (!anon_ns_root(m)) return; } scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; emptied_ns = m->mnt_ns; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } } /* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry)) continue; if (child->mnt.mnt_flags & MNT_LOCKED) return true; } return false; } bool has_locked_children(struct mount *mnt, struct dentry *dentry) { guard(mount_locked_reader)(); return __has_locked_children(mnt, dentry); } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. * * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) return false; return true; } /** * clone_private_mount - create a private clone of a path * @path: path to clone * * This creates a new vfsmount, which will be the clone of @path. The new mount * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * * This assumes caller has called or done the equivalent of may_mount(). * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); /* * Make sure the source mount is acceptable. * Anything mounted in our mount namespace is allowed. * Otherwise, it must be the root of an anonymous mount * namespace, and we need to make sure no namespace * loops get created. */ if (!check_mnt(old_mnt)) { if (!anon_ns_root(old_mnt)) return ERR_PTR(-EINVAL); if (!check_for_nsfs_mounts(old_mnt)) return ERR_PTR(-EINVAL); } if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); if (IS_ERR(new_mnt)) return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); static void lock_mnt_tree(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { int flags = p->mnt.mnt_flags; /* Don't allow unprivileged users to change mount flags */ flags |= MNT_LOCK_ATIME; if (flags & MNT_READONLY) flags |= MNT_LOCK_READONLY; if (flags & MNT_NODEV) flags |= MNT_LOCK_NODEV; if (flags & MNT_NOSUID) flags |= MNT_LOCK_NOSUID; if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ if (list_empty(&p->mnt_expire) && p != mnt) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } } static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) mnt_release_group_id(p); } } static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); return err; } } } return 0; } int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0; struct mount *p; if (ns->nr_mounts >= max) return -ENOSPC; max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; if (mounts > max) return -ENOSPC; ns->pending_mounts += mounts; return 0; } enum mnt_tree_flags_t { MNT_TREE_BENEATH = BIT(0), MNT_TREE_PROPAGATION = BIT(1), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached * @dest: the context for mounting at the place where the tree should go * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. * --------------------------------------------------------------------------- * | BIND MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (++) | shared (+) | shared(+++)| invalid | * | | | | | | * |non-shared| shared (+) | private | slave (*) | invalid | * *************************************************************************** * A bind operation clones the source mount and mounts the clone on the * destination mount. * * (++) the cloned mount is propagated to all the mounts in the propagation * tree of the destination mount and the cloned mount is added to * the peer group of the source mount. * (+) the cloned mount is created under the destination mount and is marked * as shared. The cloned mount is added to the peer group of the source * mount. * (+++) the mount is propagated to all the mounts in the propagation tree * of the destination mount and the cloned mount is made slave * of the same master as that of the source mount. The cloned mount * is marked as 'shared and slave'. * (*) the cloned mount is made a slave of the same master as that of the * source mount. * * --------------------------------------------------------------------------- * | MOVE MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (+) | shared (+) | shared(+++) | invalid | * | | | | | | * |non-shared| shared (+*) | private | slave (*) | unbindable | * *************************************************************************** * * (+) the mount is moved to the destination. And is then propagated to * all the mounts in the propagation tree of the destination mount. * (+*) the mount is moved to the destination. * (+++) the mount is moved to the destination and is then propagated to * all the mounts belonging to the destination mount's propagation tree. * the mount is marked as 'shared and slave'. * (*) the mount continues to be a slave at the new location. * * if the source mount is a tree, the operations explained above is * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. * * Context: The function expects namespace_lock() to be held. * Return: If @source_mnt was successfully attached 0 is returned. * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *dest_mnt = dest->parent; struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; struct mountpoint *shorter = NULL; struct mount *child, *p; struct mount *top; struct hlist_node *n; int err = 0; bool moving = mnt_has_parent(source_mnt); /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ for (top = source_mnt; unlikely(top->overmount); top = top->overmount) { if (!shorter && is_mnt_ns_file(top->mnt.mnt_root)) shorter = top->mnt_mp; } err = get_mountpoint(top->mnt.mnt_root, &root); if (err) return err; /* Is there space to add these mounts to the mount namespace? */ if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; } if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); } lock_mount_hash(); if (err) goto out_cleanup_ids; if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } if (moving) { umount_mnt(source_mnt); mnt_notify_add(source_mnt); /* if the mount is moved, it should no longer be expired * automatically */ list_del_init(&source_mnt->mnt_expire); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ emptied_ns = source_mnt->mnt_ns; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) move_from_ns(p); } } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); /* * Now the original copy is in the same state as the secondaries - * its root attached to mountpoint, but not hashed and all mounts * in it are either in our namespace or in no namespace at all. * Add the original to the list of copies and deal with the * rest of work for all of them uniformly. */ hlist_add_head(&source_mnt->mnt_hash, &tree_list); hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); commit_tree(child); if (q) { struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; if (unlikely(shorter) && child != source_mnt) mp = shorter; mnt_change_mountpoint(r, mp, q); } } unpin_mountpoint(&root); unlock_mount_hash(); return 0; out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); unpin_mountpoint(&root); read_sequnlock_excl(&mount_lock); return err; } static inline struct mount *where_to_mount(const struct path *path, struct dentry **dentry, bool beneath) { struct mount *m; if (unlikely(beneath)) { m = topmost_overmount(real_mount(path->mnt)); *dentry = m->mnt_mountpoint; return m->mnt_parent; } m = __lookup_mnt(path->mnt, path->dentry); if (unlikely(m)) { m = topmost_overmount(m); *dentry = m->mnt.mnt_root; return m; } *dentry = path->dentry; return real_mount(path->mnt); } /** * do_lock_mount - acquire environment for mounting * @path: target path * @res: context to set up * @beneath: whether the intention is to mount beneath @path * * To mount something at given location, we need * namespace_sem locked exclusive * inode of dentry we are mounting on locked exclusive * struct mountpoint for that dentry * struct mount we are mounting on * * Results are stored in caller-supplied context (pinned_mountpoint); * on success we have res->parent and res->mp pointing to parent and * mountpoint respectively and res->node inserted into the ->m_list * of the mountpoint, making sure the mountpoint won't disappear. * On failure we have res->parent set to ERR_PTR(-E...), res->mp * left NULL, res->node - empty. * In case of success do_lock_mount returns with locks acquired (in * proper order - inode lock nests outside of namespace_sem). * * Request to mount on overmounted location is treated as "mount on * top of whatever's overmounting it"; request to mount beneath * a location - "mount immediately beneath the topmost mount at that * place". * * In all cases the location must not have been unmounted and the * chosen mountpoint must be allowed to be mounted on. For "beneath" * case we also require the location to be at the root of a mount * that has a parent (i.e. is not a root of some namespace). */ static void do_lock_mount(const struct path *path, struct pinned_mountpoint *res, bool beneath) { int err; if (unlikely(beneath) && !path_mounted(path)) { res->parent = ERR_PTR(-EINVAL); return; } do { struct dentry *dentry, *d; struct mount *m, *n; scoped_guard(mount_locked_reader) { m = where_to_mount(path, &dentry, beneath); if (&m->mnt != path->mnt) { mntget(&m->mnt); dget(dentry); } } inode_lock(dentry->d_inode); namespace_lock(); // check if the chain of mounts (if any) has changed. scoped_guard(mount_locked_reader) n = where_to_mount(path, &d, beneath); if (unlikely(n != m || dentry != d)) err = -EAGAIN; // something moved, retry else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) err = -ENOENT; // not to be mounted on else if (beneath && &m->mnt == path->mnt && !m->overmount) err = -EINVAL; else err = get_mountpoint(dentry, res); if (unlikely(err)) { res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); } else { res->parent = m; } /* * Drop the temporary references. This is subtle - on success * we are doing that under namespace_sem, which would normally * be forbidden. However, in that case we are guaranteed that * refcounts won't reach zero, since we know that path->mnt * is mounted and thus all mounts reachable from it are pinned * and stable, along with their mountpoints and roots. */ if (&m->mnt != path->mnt) { dput(dentry); mntput(&m->mnt); } } while (err == -EAGAIN); } static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); unpin_mountpoint(m); read_sequnlock_excl(&mount_lock); namespace_unlock(); } static inline void unlock_mount(struct pinned_mountpoint *m) { if (!IS_ERR(m->parent)) __unlock_mount(m); } #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ do_lock_mount((path), &mp, (beneath)) #define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) #define LOCK_MOUNT_EXACT(mp, path) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ lock_mount_exact((path), &mp) static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) { struct mnt_namespace *ns = m->mnt_ns; // it must be mounted in some namespace if (IS_ERR_OR_NULL(ns)) // is_mounted() return -EINVAL; // and the caller must be admin in userns of that namespace if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return 0; /* Only one propagation flag should be set */ if (!is_power_of_2(type)) return 0; return type; } /* * recursively change the type of the mountpoint. */ static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; int err; if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; guard(namespace_excl)(); err = may_change_propagation(mnt); if (err) return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); return 0; } /* may_copy_tree() - check if a mount tree can be copied * @path: path to the mount tree to be copied * * This helper checks if the caller may copy the mount tree starting * from @path->mnt. The caller may copy the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller tries to copy an nfs mount referring to a mount * namespace, i.e., the caller is trying to copy a mount namespace * entry from nsfs. * (3) The caller tries to copy a pidfs mount referring to a pidfd. * (4) The caller is trying to copy a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * This is not strictly necessary. The current semantics of the new * mount api enforce that the caller must be located in the same * mount namespace as the mount tree it interacts with. Using the * origin sequence number preserves these semantics even for * anonymous mount namespaces. However, one could envision extending * the api to directly operate across mount namespace if needed. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to copy the mount tree. * * Returns true if the mount tree can be copied, false otherwise. */ static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; if (check_mnt(mnt)) return true; d_op = path->dentry->d_op; if (d_op == &ns_dentry_operations) return true; if (d_op == &pidfs_dentry_operations) return true; if (!is_mounted(path->mnt)) return false; return check_anonymous_mnt(mnt); } static struct mount *__do_loopback(const struct path *old_path, int recurse) { struct mount *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) return ERR_PTR(-EINVAL); if (!may_copy_tree(old_path)) return ERR_PTR(-EINVAL); if (!recurse && __has_locked_children(old, old_path->dentry)) return ERR_PTR(-EINVAL); if (recurse) return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else return clone_mnt(old, old_path->dentry, 0); } /* * do loopback mount. */ static int do_loopback(const struct path *path, const char *old_name, int recurse) { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; if (mnt_ns_loop(old_path.dentry)) return -EINVAL; LOCK_MOUNT(mp, path); if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); if (!check_mnt(mp.parent)) return -EINVAL; mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) return PTR_ERR(mnt); err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } return err; } static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ns; guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. * This needs to hold namespace_sem to ensure that the mount * doesn't get attached. */ if (is_mounted(path->mnt)) { src_mnt_ns = real_mount(path->mnt)->mnt_ns; if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { emptied_ns = ns; return ERR_CAST(mnt); } for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; return ns; } static struct file *open_detached_copy(struct path *path, bool recursive) { struct mnt_namespace *ns = get_detached_copy(path, recursive); struct file *file; if (IS_ERR(ns)) return ERR_CAST(ns); mntput(path->mnt); path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); else file->f_mode |= FMODE_NEED_UNMOUNT; return file; } static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { int ret; struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) return ERR_PTR(-EPERM); ret = user_path_at(dfd, filename, lookup_flags, &path); if (unlikely(ret)) return ERR_PTR(ret); if (detached) return open_detached_copy(&path, flags & AT_RECURSIVE); return dentry_open(&path, O_PATH, current_cred()); } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { int fd; struct file *file __free(fput) = NULL; file = vfs_open_tree(dfd, filename, flags); if (IS_ERR(file)) return PTR_ERR(file); fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, no_free_ptr(file)); return fd; } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { unsigned int fl = mnt->mnt.mnt_flags; if ((fl & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) return false; if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) return false; if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC)) return false; if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) return false; return true; } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { bool readonly_request = (mnt_flags & MNT_READONLY); if (readonly_request == __mnt_is_readonly(&mnt->mnt)) return 0; if (readonly_request) return mnt_make_readonly(mnt); mnt->mnt.mnt_flags &= ~MNT_READONLY; return 0; } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); } static void mnt_warn_timestamp_expiry(const struct path *mountpoint, struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf, *mntpath; buf = (char *)__get_free_page(GFP_KERNEL); if (buf) mntpath = d_path(mountpoint, buf, PAGE_SIZE); else mntpath = ERR_PTR(-ENOMEM); if (IS_ERR(mntpath)) mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; if (buf) free_page((unsigned long)buf); } } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); int ret; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; /* * We're only checking whether the superblock is read-only not * changing it, so only take down_read(&sb->s_umount). */ down_read(&sb->s_umount); lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); return ret; } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(const struct path *path, int sb_flags, int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the remount request is coming * from the legacy mount system call. */ fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); if (!err) { lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); } } up_write(&sb->s_umount); } mnt_warn_timestamp_expiry(path, &mnt->mnt); put_fs_context(fc); return err; } static inline int tree_contains_unbindable(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; } return 0; } static int do_set_group(const struct path *from_path, const struct path *to_path) { struct mount *from = real_mount(from_path->mnt); struct mount *to = real_mount(to_path->mnt); int err; guard(namespace_excl)(); err = may_change_propagation(from); if (err) return err; err = may_change_propagation(to); if (err) return err; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) return -EINVAL; if (!path_mounted(to_path)) return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } return 0; } /** * path_overmounted - check if path is overmounted * @path: path to check * * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * * Context: namespace_sem must be held at least shared. * MUST NOT be called under lock_mount_hash() (there one should just * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { unsigned seq = read_seqbegin(&mount_lock); bool no_child; rcu_read_lock(); no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); if (need_seqretry(&mount_lock, seq)) { read_seqlock_excl(&mount_lock); no_child = !__lookup_mnt(path->mnt, path->dentry); read_sequnlock_excl(&mount_lock); } return unlikely(!no_child); } /* * Check if there is a possibly empty chain of descent from p1 to p2. * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). */ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) { while (p2 != p1 && mnt_has_parent(p2)) p2 = p2->mnt_parent; return p2 == p1; } /** * can_move_mount_beneath - check that we can mount beneath the top mount * @mnt_from: mount we are trying to move * @mnt_to: mount under which to mount * @mp: mountpoint of @mnt_to * * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to * nonsensical mount trees. * * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(const struct mount *mnt_from, const struct mount *mnt_to, const struct mountpoint *mp) { struct mount *parent_mnt_to = mnt_to->mnt_parent; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ if (mnt_from->overmount) return -EINVAL; /* * Mounting beneath the rootfs only makes sense when the * semantics of pivot_root(".", ".") are used. */ if (&mnt_to->mnt == current->fs->root.mnt) return -EINVAL; if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; if (mount_is_ancestor(mnt_to, mnt_from)) return -EINVAL; /* * If the parent mount propagates to the child mount this would * mean mounting @mnt_from on @mnt_to->mnt_parent and then * propagating a copy @c of @mnt_from on top of @mnt_to. This * defeats the whole purpose of mounting beneath another mount. */ if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) return -EINVAL; /* * If @mnt_to->mnt_parent propagates to @mnt_from this would * mean propagating a copy @c of @mnt_from on top of @mnt_from. * Afterwards @mnt_from would be mounted on top of * @mnt_to->mnt_parent and @mnt_to would be unmounted from * @mnt->mnt_parent and remounted on @mnt_from. But since @c is * already mounted on @mnt_from, @mnt_to would ultimately be * remounted on top of @c. Afterwards, @mnt_from would be * covered by a copy @c of @mnt_from and @c would be covered by * @mnt_from itself. This defeats the whole purpose of mounting * @mnt_from beneath @mnt_to. */ if (check_mnt(mnt_from) && propagation_would_overmount(parent_mnt_to, mnt_from, mp)) return -EINVAL; return 0; } /* may_use_mount() - check if a mount tree can be used * @mnt: vfsmount to be used * * This helper checks if the caller may use the mount tree starting * from @path->mnt. The caller may use the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller is trying to use a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to use the mount tree. * * Returns true if the mount tree can be used, false otherwise. */ static inline bool may_use_mount(struct mount *mnt) { if (check_mnt(mnt)) return true; /* * Make sure that noone unmounted the target path or somehow * managed to get their hands on something purely kernel * internal. */ if (!is_mounted(&mnt->mnt)) return false; return check_anonymous_mnt(mnt); } static int do_move_mount(const struct path *old_path, const struct path *new_path, enum mnt_tree_flags_t flags) { struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; if (!path_mounted(old_path)) return -EINVAL; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) return -EINVAL; LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); if (IS_ERR(mp.parent)) return PTR_ERR(mp.parent); if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) return -EINVAL; /* ... which should not be shared */ if (IS_MNT_SHARED(old->mnt_parent)) return -EINVAL; /* ... and the target should be in our namespace */ if (!check_mnt(mp.parent)) return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ if (old->mnt_ns == mp.parent->mnt_ns) return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ if (!may_use_mount(mp.parent)) return -EINVAL; } if (beneath) { struct mount *over = real_mount(new_path->mnt); if (mp.parent != over->mnt_parent) over = mp.parent->overmount; err = can_move_mount_beneath(old, over, mp.mp); if (err) return err; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) return -EINVAL; if (!check_for_nsfs_mounts(old)) return -ELOOP; if (mount_is_ancestor(old, mp.parent)) return -ELOOP; return attach_recursive_mnt(old, &mp); } static int do_move_mount_old(const struct path *path, const char *old_name) { struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; return do_move_mount(&old_path, path, 0); } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, int mnt_flags) { struct mount *parent = mp->parent; if (IS_ERR(parent)) return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) return -EINVAL; } /* Refuse the same filesystem on the same mount point */ if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { struct super_block *sb; struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; if (IS_ERR(mnt)) return PTR_ERR(mnt); sb = fc->root->d_sb; error = security_sb_kern_mount(sb); if (unlikely(error)) return error; if (unlikely(mount_too_revealing(sb, &mnt_flags))) { errorfcp(fc, "VFS", "Mount too revealing"); return -EPERM; } mnt_warn_timestamp_expiry(mountpoint, mnt); LOCK_MOUNT(mp, mountpoint); error = do_add_mount(real_mount(mnt), &mp, mnt_flags); if (!error) retain_and_null_ptr(mnt); // consumed on success return error; } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(const struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; const char *subtype = NULL; int err = 0; if (!fstype) return -EINVAL; type = get_fs_type(fstype); if (!type) return -ENODEV; if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.'); if (subtype) { subtype++; if (!*subtype) { put_filesystem(type); return -EINVAL; } } } fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the mount request is coming * from the legacy mount system call. */ fc->oldapi = true; if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } static void lock_mount_exact(const struct path *path, struct pinned_mountpoint *mp) { struct dentry *dentry = path->dentry; int err; inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) err = -ENOENT; else if (path_overmounted(path)) err = -EBUSY; else err = get_mountpoint(dentry, mp); if (unlikely(err)) { namespace_unlock(); inode_unlock(dentry->d_inode); mp->parent = ERR_PTR(err); } else { mp->parent = real_mount(path->mnt); } } int finish_automount(struct vfsmount *__m, const struct path *path) { struct vfsmount *m __free(mntput) = __m; struct mount *mnt; int err; if (!m) return 0; if (IS_ERR(m)) return PTR_ERR(m); mnt = real_mount(m); if (m->mnt_root == path->dentry) return -ELOOP; /* * we don't want to use LOCK_MOUNT() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ LOCK_MOUNT_EXACT(mp, path); if (mp.parent == ERR_PTR(-EBUSY)) return 0; err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); if (likely(!err)) retain_and_null_ptr(m); return err; } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); } EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); if (list_empty(mounts)) return; guard(namespace_excl)(); guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!is_mounted(&mnt->mnt)) continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { struct mount *this_parent = parent; struct list_head *next; int found = 0; repeat: next = this_parent->mnt_mounts.next; resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. */ if (!list_empty(&mnt->mnt_mounts)) { this_parent = mnt; goto repeat; } if (!propagate_mount_busy(mnt, 1)) { list_move_tail(&mnt->mnt_expire, graveyard); found++; } } /* * All done at this level ... ascend and resume the search */ if (this_parent != parent) { next = this_parent->mnt_child.next; this_parent = this_parent->mnt_parent; goto resume; } return found; } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } static void *copy_mount_options(const void __user * data) { char *copy; unsigned left, offset; if (!data) return NULL; copy = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!copy) return ERR_PTR(-ENOMEM); left = copy_from_user(copy, data, PAGE_SIZE); /* * Not all architectures have an exact copy_from_user(). Resort to * byte at a time. */ offset = PAGE_SIZE - left; while (left) { char c; if (get_user(c, (const char __user *)data + offset)) break; copy[offset] = c; left--; offset++; } if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } return copy; } static char *copy_mount_string(const void __user *data) { return data ? strndup_user(data, PATH_MAX) : NULL; } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER) return -EINVAL; ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret) return ret; if (!may_mount()) return -EPERM; if (flags & SB_MANDLOCK) warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | SB_POSIXACL | SB_LAZYTIME | SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) return do_remount(path, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE) return do_move_mount_old(path, dev_name); return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, data_page); } int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); } static void dec_mnt_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); } static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; int ret; ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } if (anon) ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } if (!anon) ns_tree_gen_id(&new_ns->ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; } __latent_entropy struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt __free(mntput) = NULL; struct vfsmount *pwdmnt __free(mntput) = NULL; struct mount *p, *q; struct mount *old; struct mount *new; int copy_flags; BUG_ON(!ns); if (likely(!(flags & CLONE_NEWNS))) { get_mnt_ns(ns); return ns; } old = ns->root; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; guard(namespace_excl)(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { emptied_ns = new_ns; return ERR_CAST(new); } if (user_ns != ns->user_ns) { guard(mount_writer)(); lock_mnt_tree(new); } new_ns->root = new; /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ p = old; q = new; while (p) { mnt_add_to_ns(new_ns, q); new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); rootmnt = &p->mnt; } if (&p->mnt == new_fs->pwd.mnt) { new_fs->pwd.mnt = mntget(&q->mnt); pwdmnt = &p->mnt; } } p = next_mnt(p, old); q = next_mnt(q, new); if (!q) break; // an mntns binding we'd skipped? while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } ns_tree_add_raw(new_ns); return new_ns; } struct dentry *mount_subtree(struct vfsmount *m, const char *name) { struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) { mntput(m); return ERR_CAST(ns); } ns->root = mnt; ns->nr_mounts++; mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); if (err) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ down_write(&s->s_umount); /* ... and return the root of (sub)tree on it */ return path.dentry; } EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dev; void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); if (IS_ERR(kernel_type)) goto out_type; kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) goto out_dev; options = copy_mount_options(data); ret = PTR_ERR(options); if (IS_ERR(options)) goto out_data; ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); kfree(options); out_data: kfree(kernel_dev); out_dev: kfree(kernel_type); out_type: return ret; } #define FSMOUNT_VALID_FLAGS \ (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) #define MOUNT_SETATTR_PROPAGATION_FLAGS \ (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { unsigned int mnt_flags = 0; if (attr_flags & MOUNT_ATTR_RDONLY) mnt_flags |= MNT_READONLY; if (attr_flags & MOUNT_ATTR_NOSUID) mnt_flags |= MNT_NOSUID; if (attr_flags & MOUNT_ATTR_NODEV) mnt_flags |= MNT_NODEV; if (attr_flags & MOUNT_ATTR_NOEXEC) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. */ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { struct mnt_namespace *ns; struct fs_context *fc; struct file *file; struct path newmount; struct mount *mnt; unsigned int mnt_flags = 0; long ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: break; case MOUNT_ATTR_NOATIME: mnt_flags |= MNT_NOATIME; break; case MOUNT_ATTR_RELATIME: mnt_flags |= MNT_RELATIME; break; default: return -EINVAL; } CLASS(fd, f)(fs_fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) goto err_unlock; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { errorfcp(fc, "VFS", "Mount too revealing"); goto err_unlock; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { ret = PTR_ERR(newmount.mnt); goto err_unlock; } newmount.dentry = dget(fc->root); newmount.mnt->mnt_flags = mnt_flags; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). We don't want to * do any memory allocation or anything like that at this point as we * don't want to have to handle any errors incurred. */ vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) { ret = PTR_ERR(ns); goto err_path; } mnt = real_mount(newmount.mnt); ns->root = mnt; ns->nr_mounts = 1; mnt_add_to_ns(ns, mnt); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount * it, not just simply put it. */ file = dentry_open(&newmount, O_PATH, fc->cred); if (IS_ERR(file)) { dissolve_on_fput(newmount.mnt); ret = PTR_ERR(file); goto err_path; } file->f_mode |= FMODE_NEED_UNMOUNT; ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); if (ret >= 0) fd_install(ret, file); else fput(file); err_path: path_put(&newmount); err_unlock: mutex_unlock(&fc->uapi_mutex); return ret; } static inline int vfs_move_mount(const struct path *from_path, const struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; ret = security_move_mount(from_path, to_path); if (ret) return ret; if (mflags & MNT_TREE_PROPAGATION) return do_set_group(from_path, to_path); return do_move_mount(from_path, to_path, mflags); } /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path to_path __free(path_put) = {}; struct path from_path __free(path_put) = {}; struct filename *to_name __free(putname) = NULL; struct filename *from_name __free(putname) = NULL; unsigned int lflags, uflags; enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) return -EPERM; if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; uflags = 0; if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; to_name = getname_maybe_null(to_pathname, uflags); if (IS_ERR(to_name)) return PTR_ERR(to_name); if (!to_name && to_dfd >= 0) { CLASS(fd_raw, f_to)(to_dfd); if (fd_empty(f_to)) return -EBADF; to_path = fd_file(f_to)->f_path; path_get(&to_path); } else { lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); if (ret) return ret; } uflags = 0; if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; from_name = getname_maybe_null(from_pathname, uflags); if (IS_ERR(from_name)) return PTR_ERR(from_name); if (!from_name && from_dfd >= 0) { CLASS(fd_raw, f_from)(from_dfd); if (fd_empty(f_from)) return -EBADF; return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); } lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); if (ret) return ret; return vfs_move_mount(&from_path, &to_path, mflags); } /* * Return true if path is reachable from root * * locks: mount_locked_reader || namespace_shared && is_mounted(mnt) */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; } return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } bool path_is_under(const struct path *path1, const struct path *path2) { guard(mount_locked_reader)(); return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); } EXPORT_SYMBOL(path_is_under); /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets * root/cwd of all processes which had them on the current root to new_root. * * Restrictions: * The new_root and put_old must be directories, and must not be on the * same file system as the current process root. The put_old must be * underneath new_root, i.e. adding a non-zero number of /.. to the string * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) * - it's okay to pick a root that isn't the root of a file system, e.g. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. */ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new __free(path_put) = {}; struct path old __free(path_put) = {}; struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; int error; if (!may_mount()) return -EPERM; error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) return error; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) return error; error = security_sb_pivotroot(&old, &new); if (error) return error; get_fs_root(current->fs, &root); LOCK_MOUNT(old_mp, &old); old_mnt = old_mp.parent; if (IS_ERR(old_mnt)) return PTR_ERR(old_mnt); new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) return -EINVAL; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) return -EINVAL; if (d_unlinked(new.dentry)) return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) return -EBUSY; /* loop, on the same file system */ if (!path_mounted(&root)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) return -EINVAL; /* absolute root */ if (!path_mounted(&new)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount new_root on / */ attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); umount_mnt(root_mnt); /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); return 0; } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; /* flags to clear */ flags &= ~kattr->attr_clr; /* flags to raise */ flags |= kattr->attr_set; return flags; } static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* * We only allow an mount to change it's idmapping if it has * never been accessible to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; /* The filesystem has turned off idmapped mounts. */ if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) return -EINVAL; /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ if (!is_anon_ns(mnt->mnt_ns)) return -EINVAL; return 0; } /** * mnt_allow_writers() - check whether the attribute change allows writers * @kattr: the new mount attributes * @mnt: the mount to which @kattr will be applied * * Check whether thew new mount attributes in @kattr allow concurrent writers. * * Return: true if writers need to be held, false if not */ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; int err; for (m = mnt; m; m = next_mnt(m, mnt)) { if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { err = -EPERM; break; } err = can_idmap_mount(kattr, m); if (err) break; if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); if (err) { m = next_mnt(m, mnt); break; } } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } if (err) { /* undo all mnt_hold_writers() we'd done */ for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) mnt_unhold_writers(p); } return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct mnt_idmap *old_idmap; if (!kattr->mnt_idmap) return; old_idmap = mnt_idmap(&mnt->mnt); /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; for (m = mnt; m; m = next_mnt(m, mnt)) { unsigned int flags; do_idmap_mount(kattr, m); flags = recalc_flags(kattr, m); WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { struct mnt_idmap *mnt_idmap; mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); if (IS_ERR(mnt_idmap)) return PTR_ERR(mnt_idmap); kattr->mnt_idmap = mnt_idmap; } if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing * propagation. */ namespace_lock(); if (kattr->propagation == MS_SHARED) { err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; } } } err = -EINVAL; lock_mount_hash(); if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* * First, we get the mount tree in a shape where we can change mount * properties without failure. If we succeeded to do so we commit all * changes and if we failed we clean up. */ err = mount_setattr_prepare(kattr, mnt); if (!err) mount_setattr_commit(kattr, mnt); out: unlock_mount_hash(); if (kattr->propagation) { if (err) cleanup_group_ids(mnt, NULL); namespace_unlock(); } return err; } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; if (attr->attr_clr & MOUNT_ATTR_IDMAP) { /* * We can only remove an idmapping if it's never been * exposed to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) return -EINVAL; /* * Removal of idmappings is equivalent to setting * nop_mnt_idmap. */ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { kattr->mnt_idmap = &nop_mnt_idmap; return 0; } } if (attr->userns_fd > INT_MAX) return -EINVAL; CLASS(fd, f)(attr->userns_fd); if (fd_empty(f)) return -EBADF; if (!proc_ns_file(fd_file(f))) return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* * The initial idmapping cannot be used to create an idmapped * mount. We use the initial idmapping as an indicator of a mount * that is not idmapped. It can simply be passed into helpers that * are aware of idmapped mounts as a convenient shortcut. A user * can just create a dedicated identity mapping to achieve the same * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); if (mnt_userns == &init_user_ns) return -EPERM; /* We're not controlling the target namespace. */ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) return -EPERM; kattr->mnt_userns = get_user_ns(mnt_userns); return 0; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) return -EINVAL; kattr->propagation = attr->propagation; if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) return -EINVAL; kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); /* * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, * users wanting to transition to a different atime setting cannot * simply specify the atime setting in @attr_set, but must also * specify MOUNT_ATTR__ATIME in the @attr_clr field. * So ensure that MOUNT_ATTR__ATIME can't be partially set in * @attr_clr and that @attr_set can't have any atime bits set if * MOUNT_ATTR__ATIME isn't set in @attr_clr. */ if (attr->attr_clr & MOUNT_ATTR__ATIME) { if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) return -EINVAL; /* * Clear all previous time settings as they are mutually * exclusive. */ kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; switch (attr->attr_set & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_RELATIME: kattr->attr_set |= MNT_RELATIME; break; case MOUNT_ATTR_NOATIME: kattr->attr_set |= MNT_NOATIME; break; case MOUNT_ATTR_STRICTATIME: break; default: return -EINVAL; } } else { if (attr->attr_set & MOUNT_ATTR__ATIME) return -EINVAL; } return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { if (kattr->mnt_userns) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, struct mount_kattr *kattr) { int ret; struct mount_attr attr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) return -EINVAL; if (!may_mount()) return -EPERM; ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); if (ret) return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) return 0; /* Tell caller to not bother. */ ret = build_mount_kattr(&attr, usize, kattr); if (ret < 0) return ret; return 1; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { int err; struct path target; struct mount_kattr kattr; unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; if (flags & ~(AT_EMPTY_PATH | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) return -EINVAL; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; kattr = (struct mount_kattr) { .lookup_flags = lookup_flags, }; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; err = wants_mount_setattr(uattr, usize, &kattr); if (err <= 0) return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); if (!err) { err = do_mount_setattr(&target, &kattr); path_put(&target); } finish_mount_kattr(&kattr); return err; } SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, unsigned, flags, struct mount_attr __user *, uattr, size_t, usize) { struct file __free(fput) *file = NULL; int fd; if (!uattr && usize) return -EINVAL; file = vfs_open_tree(dfd, filename, flags); if (IS_ERR(file)) return PTR_ERR(file); if (uattr) { int ret; struct mount_kattr kattr = {}; if (flags & OPEN_TREE_CLONE) kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); finish_mount_kattr(&kattr); } if (ret) return ret; } fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, no_free_ptr(file)); return fd; } int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) return root->d_sb->s_op->show_path(m, root); seq_dentry(m, root, " \t\n\\"); return 0; } static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) { struct mount *mnt = mnt_find_id_at(ns, id); if (!mnt || mnt->mnt_id_unique != id) return NULL; return &mnt->mnt; } struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; struct mnt_idmap *idmap; u64 mask; struct path root; struct seq_file seq; /* Must be last --ends in a flexible-array member. */ struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) { unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); u64 attr_flags = 0; if (mnt_flags & MNT_READONLY) attr_flags |= MOUNT_ATTR_RDONLY; if (mnt_flags & MNT_NOSUID) attr_flags |= MOUNT_ATTR_NOSUID; if (mnt_flags & MNT_NODEV) attr_flags |= MOUNT_ATTR_NODEV; if (mnt_flags & MNT_NOEXEC) attr_flags |= MOUNT_ATTR_NOEXEC; if (mnt_flags & MNT_NODIRATIME) attr_flags |= MOUNT_ATTR_NODIRATIME; if (mnt_flags & MNT_NOSYMFOLLOW) attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; if (mnt_flags & MNT_NOATIME) attr_flags |= MOUNT_ATTR_NOATIME; else if (mnt_flags & MNT_RELATIME) attr_flags |= MOUNT_ATTR_RELATIME; else attr_flags |= MOUNT_ATTR_STRICTATIME; if (is_idmapped_mnt(mnt)) attr_flags |= MOUNT_ATTR_IDMAP; return attr_flags; } static u64 mnt_to_propagation_flags(struct mount *m) { u64 propagation = 0; if (IS_MNT_SHARED(m)) propagation |= MS_SHARED; if (IS_MNT_SLAVE(m)) propagation |= MS_SLAVE; if (IS_MNT_UNBINDABLE(m)) propagation |= MS_UNBINDABLE; if (!propagation) propagation |= MS_PRIVATE; return propagation; } static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; s->sm.mask |= STATMOUNT_SB_BASIC; s->sm.sb_dev_major = MAJOR(sb->s_dev); s->sm.sb_dev_minor = MINOR(sb->s_dev); s->sm.sb_magic = sb->s_magic; s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); } static void statmount_mnt_basic(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_MNT_BASIC; s->sm.mnt_id = m->mnt_id_unique; s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; s->sm.mnt_id_old = m->mnt_id; s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; } static void statmount_propagate_from(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_PROPAGATE_FROM; if (IS_MNT_SLAVE(m)) s->sm.propagate_from = get_dominating_id(m, &current->fs->root); } static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { int ret; size_t start = seq->count; ret = show_path(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* * Unescape the result. It would be better if supplied string was not * escaped in the first place, but that's a pretty invasive change. */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); return 0; } static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; err = seq_path_root(seq, &mnt_path, &s->root, ""); return err == SEQ_SKIP ? 0 : err; } static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; seq_puts(seq, sb->s_type->name); return 0; } static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; if (sb->s_subtype) seq_puts(seq, sb->s_subtype); } static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; struct mount *r = real_mount(s->mnt); if (sb->s_op->show_devname) { size_t start = seq->count; int ret; ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* Unescape the result */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); } else { seq_puts(seq, r->mnt_devname); } return 0; } static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; } if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (seq->count == start) return 0; /* skip leading comma */ memmove(seq->buf + start, seq->buf + start + 1, seq->count - start - 1); seq->count--; return 0; } static inline int statmount_opt_process(struct seq_file *seq, size_t start) { char *buf_end, *opt_end, *src, *dst; int count = 0; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; buf_end = seq->buf + seq->count; dst = seq->buf + start; src = dst + 1; /* skip initial comma */ if (src >= buf_end) { seq->count = start; return 0; } *buf_end = '\0'; for (; src < buf_end; src = opt_end + 1) { opt_end = strchrnul(src, ','); *opt_end = '\0'; dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1; if (WARN_ON_ONCE(++count == INT_MAX)) return -EOVERFLOW; } seq->count = dst - 1 - seq->buf; return count; } static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; if (!sb->s_op->show_options) return 0; err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_num = err; return 0; } static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_sec_num = err; return 0; } static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, true); if (ret < 0) return ret; s->sm.mnt_uidmap_num = ret; /* * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_UIDMAP; return 0; } static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, false); if (ret < 0) return ret; s->sm.mnt_gidmap_num = ret; /* * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_GIDMAP; return 0; } static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; u32 start, *offp; /* Reserve an empty string at the beginning for any unset offsets */ if (!seq->count) seq_putc(seq, 0); start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: sm->mnt_uidmap = start; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: sm->mnt_gidmap = start; ret = statmount_mnt_gidmap(s, seq); break; default: WARN_ON_ONCE(true); return -EINVAL; } /* * If nothing was emitted, return to avoid setting the flag * and terminating the buffer. */ if (seq->count == start) return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) return -EOVERFLOW; /* signal a retry */ if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (ret) return ret; seq->buf[seq->count++] = '\0'; sm->mask |= flag; *offp = start; return 0; } static int copy_statmount_to_user(struct kstatmount *s) { struct statmount *sm = &s->sm; struct seq_file *seq = &s->seq; char __user *str = ((char __user *)s->buf) + sizeof(*sm); size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); if (seq->count && copy_to_user(str, seq->buf, seq->count)) return -EFAULT; /* Return the number of bytes copied to the buffer */ sm->size = copysize + seq->count; if (copy_to_user(s->buf, sm, copysize)) return -EFAULT; return 0; } static struct mount *listmnt_next(struct mount *curr, bool reverse) { struct rb_node *node; if (reverse) node = rb_prev(&curr->mnt_node); else node = rb_next(&curr->mnt_node); return node_to_mount(node); } static int grab_requested_root(struct mnt_namespace *ns, struct path *root) { struct mount *first, *child; rwsem_assert_held(&namespace_sem); /* We're looking at our own ns, just use get_fs_root. */ if (ns == current->nsproxy->mnt_ns) { get_fs_root(current->fs, root); return 0; } /* * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; for (;;) { child = listmnt_next(child, false); if (!child) return -ENOENT; if (child->mnt_parent == first) break; } root->mnt = mntget(&child->mnt); root->dentry = dget(root->mnt->mnt_root); return 0; } /* This must be updated whenever a new flag is added */ #define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ STATMOUNT_MNT_BASIC | \ STATMOUNT_PROPAGATE_FROM | \ STATMOUNT_MNT_ROOT | \ STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | \ STATMOUNT_MNT_NS_ID | \ STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | \ STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | \ STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_SUPPORTED_MASK | \ STATMOUNT_MNT_UIDMAP | \ STATMOUNT_MNT_GIDMAP) /* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { struct mount *m; int err; /* Has the namespace already been emptied? */ if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); if (!s->mnt) return -ENOENT; err = grab_requested_root(ns, &s->root); if (err) return err; /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ m = real_mount(s->mnt); if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; err = security_sb_statfs(s->mnt->mnt_root); if (err) return err; /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the * namespace semaphore and mount properties may change with only * the mount lock held. * * We could sample the mount lock sequence counter to detect * those changes and retry. But it's not worth it. Worst that * happens is that the mnt->mnt_idmap pointer is already changed * while mnt->mnt_flags isn't or vica versa. So what. * * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved * via READ_ONCE()/WRITE_ONCE() and guard against theoretical * torn read/write. That's all we care about right now. */ s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); if (s->mask & STATMOUNT_FS_TYPE) err = statmount_string(s, STATMOUNT_FS_TYPE); if (!err && s->mask & STATMOUNT_MNT_ROOT) err = statmount_string(s, STATMOUNT_MNT_ROOT); if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); if (!err && s->mask & STATMOUNT_OPT_ARRAY) err = statmount_string(s, STATMOUNT_OPT_ARRAY); if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); if (!err && s->mask & STATMOUNT_MNT_UIDMAP) err = statmount_string(s, STATMOUNT_MNT_UIDMAP); if (!err && s->mask & STATMOUNT_MNT_GIDMAP) err = statmount_string(s, STATMOUNT_MNT_GIDMAP); if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { s->sm.mask |= STATMOUNT_SUPPORTED_MASK; s->sm.supported_mask = STATMOUNT_SUPPORTED; } if (err) return err; /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); return 0; } static inline bool retry_statmount(const long ret, size_t *seq_size) { if (likely(ret != -EAGAIN)) return false; if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) return false; if (unlikely(*seq_size > MAX_RW_COUNT)) return false; return true; } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) { if (!access_ok(buf, bufsize)) return -EFAULT; memset(ks, 0, sizeof(*ks)); ks->mask = kreq->param; ks->buf = buf; ks->bufsize = bufsize; if (ks->mask & STATMOUNT_STRING_REQ) { if (bufsize == sizeof(ks->sm)) return -EOVERFLOW; ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); if (!ks->seq.buf) return -ENOMEM; ks->seq.size = seq_size; } return 0; } static int copy_mnt_id_req(const struct mnt_id_req __user *req, struct mnt_id_req *kreq) { int ret; size_t usize; BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); if (ret) return -EFAULT; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) return -EINVAL; memset(kreq, 0, sizeof(*kreq)); ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; if (kreq->spare != 0) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; return 0; } /* * If the user requested a specific mount namespace id, look that up and return * that, or if not simply grab a passive reference on our mount namespace and * return that. */ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) { struct mnt_namespace *mnt_ns; if (kreq->mnt_ns_id && kreq->spare) return ERR_PTR(-EINVAL); if (kreq->mnt_ns_id) return lookup_mnt_ns(kreq->mnt_ns_id); if (kreq->spare) { struct ns_common *ns; CLASS(fd, f)(kreq->spare); if (fd_empty(f)) return ERR_PTR(-EBADF); if (!proc_ns_file(fd_file(f))) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); } else { mnt_ns = current->nsproxy->mnt_ns; } refcount_inc(&mnt_ns->passive); return mnt_ns; } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct kstatmount *ks __free(kfree) = NULL; struct mnt_id_req kreq; /* We currently support retrieval of 3 strings. */ size_t seq_size = 3 * PATH_MAX; int ret; if (flags) return -EINVAL; ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); if (!ks) return -ENOMEM; retry: ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; scoped_guard(namespace_shared) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } struct klistmount { u64 last_mnt_id; u64 mnt_parent_id; u64 *kmnt_ids; u32 nr_mnt_ids; struct mnt_namespace *ns; struct path root; }; /* locks: namespace_shared */ static ssize_t do_listmount(struct klistmount *kls, bool reverse) { struct mnt_namespace *ns = kls->ns; u64 mnt_parent_id = kls->mnt_parent_id; u64 last_mnt_id = kls->last_mnt_id; u64 *mnt_ids = kls->kmnt_ids; size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) return -ENOENT; orig.dentry = orig.mnt->mnt_root; } /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; ret = security_sb_statfs(orig.dentry); if (ret) return ret; if (!last_mnt_id) { if (reverse) first = node_to_mount(ns->mnt_last_node); else first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); else first = mnt_find_id_at(ns, last_mnt_id + 1); } for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { if (r->mnt_id_unique == mnt_parent_id) continue; if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) continue; *mnt_ids = r->mnt_id_unique; mnt_ids++; nr_mnt_ids--; ret++; } return ret; } static void __free_klistmount_free(const struct klistmount *kls) { path_put(&kls->root); kvfree(kls->kmnt_ids); mnt_ns_release(kls->ns); } static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, size_t nr_mnt_ids) { u64 last_mnt_id = kreq->param; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; kls->last_mnt_id = last_mnt_id; kls->nr_mnt_ids = nr_mnt_ids; kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), GFP_KERNEL_ACCOUNT); if (!kls->kmnt_ids) return -ENOMEM; kls->ns = grab_requested_mnt_ns(kreq); if (!kls->ns) return -ENOENT; kls->mnt_parent_id = kreq->mnt_id; return 0; } SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; struct mnt_id_req kreq; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) return -EINVAL; /* * If the mount namespace really has more than 1 million mounts the * caller must iterate over the mount namespace (and reconsider their * system design...). */ if (unlikely(nr_mnt_ids > maxcount)) return -EOVERFLOW; if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); if (ret) return ret; if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ scoped_guard(namespace_shared) ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } struct mnt_namespace init_mnt_ns = { .ns.inum = ns_init_inum(&init_mnt_ns), .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), }; static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); m = real_mount(mnt); init_mnt_ns.root = m; init_mnt_ns.nr_mounts = 1; mnt_add_to_ns(&init_mnt_ns, m); init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) { int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); shmem_init(); init_rootfs(); init_mount_tree(); } void put_mnt_ns(struct mnt_namespace *ns) { if (!ns_ref_put(ns)) return; guard(namespace_excl)(); emptied_ns = ns; guard(mount_writer)(); umount_tree(ns->root, 0); } struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR(mnt)) { mnt_make_shortterm(mnt); synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } EXPORT_SYMBOL(kern_unmount); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { unsigned int i; for (i = 0; i < num; i++) mnt_make_shortterm(mnt[i]); synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); } EXPORT_SYMBOL(kern_unmount_array); bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } bool current_chrooted(void) { /* Does the current process have a non-standard root */ struct path fs_root __free(path_put) = {}; struct mount *root; get_fs_root(current->fs, &fs_root); /* Find the namespace root */ guard(mount_locked_reader)(); root = topmost_overmount(current->nsproxy->mnt_ns->root); return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; struct mount *mnt, *n; guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any * locked child mounts that cover anything except for * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); return true; next: ; } return false; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; if (ns->user_ns == &init_user_ns) return false; /* Can this filesystem be too revealing? */ s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) { /* * Foreign mounts (accessed via fchdir or through /proc * symlinks) are always treated as if they are nosuid. This * prevents namespaces from trusting potentially unsafe * suid/sgid bits, file caps, or security labels that originate * in other namespaces. */ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && current_in_userns(mnt->mnt_sb->s_user_ns); } static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = &nsproxy->mnt_ns->ns; get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } static void mntns_put(struct ns_common *ns) { put_mnt_ns(to_mnt_ns(ns)); } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(user_ns, CAP_SYS_CHROOT) || !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) return -EINVAL; if (fs->users != 1) return -EINVAL; get_mnt_ns(mnt_ns); old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, "/", LOOKUP_DOWN, &root); if (err) { /* revert to old namespace */ nsproxy->mnt_ns = old_mnt_ns; put_mnt_ns(mnt_ns); return err; } put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); path_put(&root); return 0; } static struct user_namespace *mntns_owner(struct ns_common *ns) { return to_mnt_ns(ns)->user_ns; } const struct proc_ns_operations mntns_operations = { .name = "mnt", .get = mntns_get, .put = mntns_put, .install = mntns_install, .owner = mntns_owner, }; #ifdef CONFIG_SYSCTL static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, }; static int __init init_fs_namespace_sysctls(void) { register_sysctl_init("fs", fs_namespace_sysctls); return 0; } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */
2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 // SPDX-License-Identifier: GPL-2.0 /* * Host bridge related code */ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/module.h> #include "pci.h" static struct pci_bus *find_pci_root_bus(struct pci_bus *bus) { while (bus->parent) bus = bus->parent; return bus; } struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus) { struct pci_bus *root_bus = find_pci_root_bus(bus); return to_pci_host_bridge(root_bus->bridge); } EXPORT_SYMBOL_GPL(pci_find_host_bridge); struct device *pci_get_host_bridge_device(struct pci_dev *dev) { struct pci_bus *root_bus = find_pci_root_bus(dev->bus); struct device *bridge = root_bus->bridge; kobject_get(&bridge->kobj); return bridge; } void pci_put_host_bridge_device(struct device *dev) { kobject_put(&dev->kobj); } void pci_set_host_bridge_release(struct pci_host_bridge *bridge, void (*release_fn)(struct pci_host_bridge *), void *release_data) { bridge->release_fn = release_fn; bridge->release_data = release_data; } EXPORT_SYMBOL_GPL(pci_set_host_bridge_release); void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) { struct pci_host_bridge *bridge = pci_find_host_bridge(bus); struct resource_entry *window; resource_size_t offset = 0; resource_list_for_each_entry(window, &bridge->windows) { if (resource_contains(window->res, res)) { offset = window->offset; break; } } region->start = res->start - offset; region->end = res->end - offset; } EXPORT_SYMBOL(pcibios_resource_to_bus); static bool region_contains(struct pci_bus_region *region1, struct pci_bus_region *region2) { return region1->start <= region2->start && region1->end >= region2->end; } void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, struct pci_bus_region *region) { struct pci_host_bridge *bridge = pci_find_host_bridge(bus); struct resource_entry *window; resource_size_t offset = 0; resource_list_for_each_entry(window, &bridge->windows) { struct pci_bus_region bus_region; if (resource_type(res) != resource_type(window->res)) continue; bus_region.start = window->res->start - window->offset; bus_region.end = window->res->end - window->offset; if (region_contains(&bus_region, region)) { offset = window->offset; break; } } res->start = region->start + offset; res->end = region->end + offset; } EXPORT_SYMBOL(pcibios_bus_to_resource);
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 // SPDX-License-Identifier: GPL-2.0-or-later // // core.c -- Voltage/Current Regulator framework. // // Copyright 2007, 2008 Wolfson Microelectronics PLC. // Copyright 2008 SlimLogic Ltd. // // Author: Liam Girdwood <lrg@slimlogic.co.uk> #include <linux/kernel.h> #include <linux/init.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/async.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/suspend.h> #include <linux/delay.h> #include <linux/gpio/consumer.h> #include <linux/of.h> #include <linux/reboot.h> #include <linux/regmap.h> #include <linux/regulator/of_regulator.h> #include <linux/regulator/consumer.h> #include <linux/regulator/coupler.h> #include <linux/regulator/driver.h> #include <linux/regulator/machine.h> #include <linux/module.h> #define CREATE_TRACE_POINTS #include <trace/events/regulator.h> #include "dummy.h" #include "internal.h" #include "regnl.h" static DEFINE_WW_CLASS(regulator_ww_class); static DEFINE_MUTEX(regulator_nesting_mutex); static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_map_list); static LIST_HEAD(regulator_ena_gpio_list); static LIST_HEAD(regulator_supply_alias_list); static LIST_HEAD(regulator_coupler_list); static bool has_full_constraints; static struct dentry *debugfs_root; /* * struct regulator_map * * Used to provide symbolic supply names to devices. */ struct regulator_map { struct list_head list; const char *dev_name; /* The dev_name() for the consumer */ const char *supply; struct regulator_dev *regulator; }; /* * struct regulator_enable_gpio * * Management for shared enable GPIO pin */ struct regulator_enable_gpio { struct list_head list; struct gpio_desc *gpiod; u32 enable_count; /* a number of enabled shared GPIO */ u32 request_count; /* a number of requested shared GPIO */ }; /* * struct regulator_supply_alias * * Used to map lookups for a supply onto an alternative device. */ struct regulator_supply_alias { struct list_head list; struct device *src_dev; const char *src_supply; struct device *alias_dev; const char *alias_supply; }; static int _regulator_is_enabled(struct regulator_dev *rdev); static int _regulator_disable(struct regulator *regulator); static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags); static int _regulator_get_current_limit(struct regulator_dev *rdev); static unsigned int _regulator_get_mode(struct regulator_dev *rdev); static int _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data); static int _regulator_do_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV); static int regulator_balance_voltage(struct regulator_dev *rdev, suspend_state_t state); static struct regulator *create_regulator(struct regulator_dev *rdev, struct device *dev, const char *supply_name); static void destroy_regulator(struct regulator *regulator); static void _regulator_put(struct regulator *regulator); const char *rdev_get_name(struct regulator_dev *rdev) { if (rdev->constraints && rdev->constraints->name) return rdev->constraints->name; else if (rdev->desc->name) return rdev->desc->name; else return ""; } EXPORT_SYMBOL_GPL(rdev_get_name); static bool have_full_constraints(void) { return has_full_constraints || of_have_populated_dt(); } static bool regulator_ops_is_valid(struct regulator_dev *rdev, int ops) { if (!rdev->constraints) { rdev_err(rdev, "no constraints\n"); return false; } if (rdev->constraints->valid_ops_mask & ops) return true; return false; } /** * regulator_lock_nested - lock a single regulator * @rdev: regulator source * @ww_ctx: w/w mutex acquire context * * This function can be called many times by one task on * a single regulator and its mutex will be locked only * once. If a task, which is calling this function is other * than the one, which initially locked the mutex, it will * wait on mutex. * * Return: 0 on success or a negative error number on failure. */ static inline int regulator_lock_nested(struct regulator_dev *rdev, struct ww_acquire_ctx *ww_ctx) { bool lock = false; int ret = 0; mutex_lock(&regulator_nesting_mutex); if (!ww_mutex_trylock(&rdev->mutex, ww_ctx)) { if (rdev->mutex_owner == current) rdev->ref_cnt++; else lock = true; if (lock) { mutex_unlock(&regulator_nesting_mutex); ret = ww_mutex_lock(&rdev->mutex, ww_ctx); mutex_lock(&regulator_nesting_mutex); } } else { lock = true; } if (lock && ret != -EDEADLK) { rdev->ref_cnt++; rdev->mutex_owner = current; } mutex_unlock(&regulator_nesting_mutex); return ret; } /** * regulator_lock - lock a single regulator * @rdev: regulator source * * This function can be called many times by one task on * a single regulator and its mutex will be locked only * once. If a task, which is calling this function is other * than the one, which initially locked the mutex, it will * wait on mutex. */ static void regulator_lock(struct regulator_dev *rdev) { regulator_lock_nested(rdev, NULL); } /** * regulator_unlock - unlock a single regulator * @rdev: regulator_source * * This function unlocks the mutex when the * reference counter reaches 0. */ static void regulator_unlock(struct regulator_dev *rdev) { mutex_lock(&regulator_nesting_mutex); if (--rdev->ref_cnt == 0) { rdev->mutex_owner = NULL; ww_mutex_unlock(&rdev->mutex); } WARN_ON_ONCE(rdev->ref_cnt < 0); mutex_unlock(&regulator_nesting_mutex); } /** * regulator_lock_two - lock two regulators * @rdev1: first regulator * @rdev2: second regulator * @ww_ctx: w/w mutex acquire context * * Locks both rdevs using the regulator_ww_class. */ static void regulator_lock_two(struct regulator_dev *rdev1, struct regulator_dev *rdev2, struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *held, *contended; int ret; ww_acquire_init(ww_ctx, &regulator_ww_class); /* Try to just grab both of them */ ret = regulator_lock_nested(rdev1, ww_ctx); WARN_ON(ret); ret = regulator_lock_nested(rdev2, ww_ctx); if (ret != -EDEADLOCK) { WARN_ON(ret); goto exit; } held = rdev1; contended = rdev2; while (true) { regulator_unlock(held); ww_mutex_lock_slow(&contended->mutex, ww_ctx); contended->ref_cnt++; contended->mutex_owner = current; swap(held, contended); ret = regulator_lock_nested(contended, ww_ctx); if (ret != -EDEADLOCK) { WARN_ON(ret); break; } } exit: ww_acquire_done(ww_ctx); } /** * regulator_unlock_two - unlock two regulators * @rdev1: first regulator * @rdev2: second regulator * @ww_ctx: w/w mutex acquire context * * The inverse of regulator_lock_two(). */ static void regulator_unlock_two(struct regulator_dev *rdev1, struct regulator_dev *rdev2, struct ww_acquire_ctx *ww_ctx) { regulator_unlock(rdev2); regulator_unlock(rdev1); ww_acquire_fini(ww_ctx); } static bool regulator_supply_is_couple(struct regulator_dev *rdev) { struct regulator_dev *c_rdev; int i; for (i = 1; i < rdev->coupling_desc.n_coupled; i++) { c_rdev = rdev->coupling_desc.coupled_rdevs[i]; if (rdev->supply->rdev == c_rdev) return true; } return false; } static void regulator_unlock_recursive(struct regulator_dev *rdev, unsigned int n_coupled) { struct regulator_dev *c_rdev, *supply_rdev; int i, supply_n_coupled; for (i = n_coupled; i > 0; i--) { c_rdev = rdev->coupling_desc.coupled_rdevs[i - 1]; if (!c_rdev) continue; if (c_rdev->supply && !regulator_supply_is_couple(c_rdev)) { supply_rdev = c_rdev->supply->rdev; supply_n_coupled = supply_rdev->coupling_desc.n_coupled; regulator_unlock_recursive(supply_rdev, supply_n_coupled); } regulator_unlock(c_rdev); } } static int regulator_lock_recursive(struct regulator_dev *rdev, struct regulator_dev **new_contended_rdev, struct regulator_dev **old_contended_rdev, struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *c_rdev; int i, err; for (i = 0; i < rdev->coupling_desc.n_coupled; i++) { c_rdev = rdev->coupling_desc.coupled_rdevs[i]; if (!c_rdev) continue; if (c_rdev != *old_contended_rdev) { err = regulator_lock_nested(c_rdev, ww_ctx); if (err) { if (err == -EDEADLK) { *new_contended_rdev = c_rdev; goto err_unlock; } /* shouldn't happen */ WARN_ON_ONCE(err != -EALREADY); } } else { *old_contended_rdev = NULL; } if (c_rdev->supply && !regulator_supply_is_couple(c_rdev)) { err = regulator_lock_recursive(c_rdev->supply->rdev, new_contended_rdev, old_contended_rdev, ww_ctx); if (err) { regulator_unlock(c_rdev); goto err_unlock; } } } return 0; err_unlock: regulator_unlock_recursive(rdev, i); return err; } /** * regulator_unlock_dependent - unlock regulator's suppliers and coupled * regulators * @rdev: regulator source * @ww_ctx: w/w mutex acquire context * * Unlock all regulators related with rdev by coupling or supplying. */ static void regulator_unlock_dependent(struct regulator_dev *rdev, struct ww_acquire_ctx *ww_ctx) { regulator_unlock_recursive(rdev, rdev->coupling_desc.n_coupled); ww_acquire_fini(ww_ctx); } /** * regulator_lock_dependent - lock regulator's suppliers and coupled regulators * @rdev: regulator source * @ww_ctx: w/w mutex acquire context * * This function as a wrapper on regulator_lock_recursive(), which locks * all regulators related with rdev by coupling or supplying. */ static void regulator_lock_dependent(struct regulator_dev *rdev, struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *new_contended_rdev = NULL; struct regulator_dev *old_contended_rdev = NULL; int err; mutex_lock(&regulator_list_mutex); ww_acquire_init(ww_ctx, &regulator_ww_class); do { if (new_contended_rdev) { ww_mutex_lock_slow(&new_contended_rdev->mutex, ww_ctx); old_contended_rdev = new_contended_rdev; old_contended_rdev->ref_cnt++; old_contended_rdev->mutex_owner = current; } err = regulator_lock_recursive(rdev, &new_contended_rdev, &old_contended_rdev, ww_ctx); if (old_contended_rdev) regulator_unlock(old_contended_rdev); } while (err == -EDEADLK); ww_acquire_done(ww_ctx); mutex_unlock(&regulator_list_mutex); } /* Platform voltage constraint check */ int regulator_check_voltage(struct regulator_dev *rdev, int *min_uV, int *max_uV) { BUG_ON(*min_uV > *max_uV); if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) { rdev_err(rdev, "voltage operation not allowed\n"); return -EPERM; } if (*max_uV > rdev->constraints->max_uV) *max_uV = rdev->constraints->max_uV; if (*min_uV < rdev->constraints->min_uV) *min_uV = rdev->constraints->min_uV; if (*min_uV > *max_uV) { rdev_err(rdev, "unsupportable voltage range: %d-%duV\n", *min_uV, *max_uV); return -EINVAL; } return 0; } /* return 0 if the state is valid */ static int regulator_check_states(suspend_state_t state) { return (state > PM_SUSPEND_MAX || state == PM_SUSPEND_TO_IDLE); } /* Make sure we select a voltage that suits the needs of all * regulator consumers */ int regulator_check_consumers(struct regulator_dev *rdev, int *min_uV, int *max_uV, suspend_state_t state) { struct regulator *regulator; struct regulator_voltage *voltage; list_for_each_entry(regulator, &rdev->consumer_list, list) { voltage = &regulator->voltage[state]; /* * Assume consumers that didn't say anything are OK * with anything in the constraint range. */ if (!voltage->min_uV && !voltage->max_uV) continue; if (*max_uV > voltage->max_uV) *max_uV = voltage->max_uV; if (*min_uV < voltage->min_uV) *min_uV = voltage->min_uV; } if (*min_uV > *max_uV) { rdev_err(rdev, "Restricting voltage, %u-%uuV\n", *min_uV, *max_uV); return -EINVAL; } return 0; } /* current constraint check */ static int regulator_check_current_limit(struct regulator_dev *rdev, int *min_uA, int *max_uA) { BUG_ON(*min_uA > *max_uA); if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_CURRENT)) { rdev_err(rdev, "current operation not allowed\n"); return -EPERM; } if (*max_uA > rdev->constraints->max_uA && rdev->constraints->max_uA) *max_uA = rdev->constraints->max_uA; if (*min_uA < rdev->constraints->min_uA) *min_uA = rdev->constraints->min_uA; if (*min_uA > *max_uA) { rdev_err(rdev, "unsupportable current range: %d-%duA\n", *min_uA, *max_uA); return -EINVAL; } return 0; } /* operating mode constraint check */ static int regulator_mode_constrain(struct regulator_dev *rdev, unsigned int *mode) { switch (*mode) { case REGULATOR_MODE_FAST: case REGULATOR_MODE_NORMAL: case REGULATOR_MODE_IDLE: case REGULATOR_MODE_STANDBY: break; default: rdev_err(rdev, "invalid mode %x specified\n", *mode); return -EINVAL; } if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_MODE)) { rdev_err(rdev, "mode operation not allowed\n"); return -EPERM; } /* The modes are bitmasks, the most power hungry modes having * the lowest values. If the requested mode isn't supported * try higher modes. */ while (*mode) { if (rdev->constraints->valid_modes_mask & *mode) return 0; *mode /= 2; } return -EINVAL; } static inline struct regulator_state * regulator_get_suspend_state(struct regulator_dev *rdev, suspend_state_t state) { if (rdev->constraints == NULL) return NULL; switch (state) { case PM_SUSPEND_STANDBY: return &rdev->constraints->state_standby; case PM_SUSPEND_MEM: return &rdev->constraints->state_mem; case PM_SUSPEND_MAX: return &rdev->constraints->state_disk; default: return NULL; } } static const struct regulator_state * regulator_get_suspend_state_check(struct regulator_dev *rdev, suspend_state_t state) { const struct regulator_state *rstate; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return NULL; /* If we have no suspend mode configuration don't set anything; * only warn if the driver implements set_suspend_voltage or * set_suspend_mode callback. */ if (rstate->enabled != ENABLE_IN_SUSPEND && rstate->enabled != DISABLE_IN_SUSPEND) { if (rdev->desc->ops->set_suspend_voltage || rdev->desc->ops->set_suspend_mode) rdev_warn(rdev, "No configuration\n"); return NULL; } return rstate; } static ssize_t microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); int uV; regulator_lock(rdev); uV = regulator_get_voltage_rdev(rdev); regulator_unlock(rdev); if (uV < 0) return uV; return sprintf(buf, "%d\n", uV); } static DEVICE_ATTR_RO(microvolts); static ssize_t microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", _regulator_get_current_limit(rdev)); } static DEVICE_ATTR_RO(microamps); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%s\n", rdev_get_name(rdev)); } static DEVICE_ATTR_RO(name); static const char *regulator_opmode_to_str(int mode) { switch (mode) { case REGULATOR_MODE_FAST: return "fast"; case REGULATOR_MODE_NORMAL: return "normal"; case REGULATOR_MODE_IDLE: return "idle"; case REGULATOR_MODE_STANDBY: return "standby"; } return "unknown"; } static ssize_t regulator_print_opmode(char *buf, int mode) { return sprintf(buf, "%s\n", regulator_opmode_to_str(mode)); } static ssize_t opmode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, _regulator_get_mode(rdev)); } static DEVICE_ATTR_RO(opmode); static ssize_t regulator_print_state(char *buf, int state) { if (state > 0) return sprintf(buf, "enabled\n"); else if (state == 0) return sprintf(buf, "disabled\n"); else return sprintf(buf, "unknown\n"); } static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); ssize_t ret; regulator_lock(rdev); ret = regulator_print_state(buf, _regulator_is_enabled(rdev)); regulator_unlock(rdev); return ret; } static DEVICE_ATTR_RO(state); static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); int status; char *label; status = rdev->desc->ops->get_status(rdev); if (status < 0) return status; switch (status) { case REGULATOR_STATUS_OFF: label = "off"; break; case REGULATOR_STATUS_ON: label = "on"; break; case REGULATOR_STATUS_ERROR: label = "error"; break; case REGULATOR_STATUS_FAST: label = "fast"; break; case REGULATOR_STATUS_NORMAL: label = "normal"; break; case REGULATOR_STATUS_IDLE: label = "idle"; break; case REGULATOR_STATUS_STANDBY: label = "standby"; break; case REGULATOR_STATUS_BYPASS: label = "bypass"; break; case REGULATOR_STATUS_UNDEFINED: label = "undefined"; break; default: return -ERANGE; } return sprintf(buf, "%s\n", label); } static DEVICE_ATTR_RO(status); static ssize_t min_microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->min_uA); } static DEVICE_ATTR_RO(min_microamps); static ssize_t max_microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->max_uA); } static DEVICE_ATTR_RO(max_microamps); static ssize_t min_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->min_uV); } static DEVICE_ATTR_RO(min_microvolts); static ssize_t max_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); if (!rdev->constraints) return sprintf(buf, "constraint not defined\n"); return sprintf(buf, "%d\n", rdev->constraints->max_uV); } static DEVICE_ATTR_RO(max_microvolts); static ssize_t requested_microamps_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); struct regulator *regulator; int uA = 0; regulator_lock(rdev); list_for_each_entry(regulator, &rdev->consumer_list, list) { if (regulator->enable_count) uA += regulator->uA_load; } regulator_unlock(rdev); return sprintf(buf, "%d\n", uA); } static DEVICE_ATTR_RO(requested_microamps); static ssize_t num_users_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->use_count); } static DEVICE_ATTR_RO(num_users); static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); switch (rdev->desc->type) { case REGULATOR_VOLTAGE: return sprintf(buf, "voltage\n"); case REGULATOR_CURRENT: return sprintf(buf, "current\n"); } return sprintf(buf, "unknown\n"); } static DEVICE_ATTR_RO(type); static ssize_t suspend_mem_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->state_mem.uV); } static DEVICE_ATTR_RO(suspend_mem_microvolts); static ssize_t suspend_disk_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->state_disk.uV); } static DEVICE_ATTR_RO(suspend_disk_microvolts); static ssize_t suspend_standby_microvolts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->state_standby.uV); } static DEVICE_ATTR_RO(suspend_standby_microvolts); static ssize_t suspend_mem_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, rdev->constraints->state_mem.mode); } static DEVICE_ATTR_RO(suspend_mem_mode); static ssize_t suspend_disk_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, rdev->constraints->state_disk.mode); } static DEVICE_ATTR_RO(suspend_disk_mode); static ssize_t suspend_standby_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_opmode(buf, rdev->constraints->state_standby.mode); } static DEVICE_ATTR_RO(suspend_standby_mode); static ssize_t suspend_mem_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_state(buf, rdev->constraints->state_mem.enabled); } static DEVICE_ATTR_RO(suspend_mem_state); static ssize_t suspend_disk_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_state(buf, rdev->constraints->state_disk.enabled); } static DEVICE_ATTR_RO(suspend_disk_state); static ssize_t suspend_standby_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return regulator_print_state(buf, rdev->constraints->state_standby.enabled); } static DEVICE_ATTR_RO(suspend_standby_state); static ssize_t bypass_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); const char *report; bool bypass; int ret; ret = rdev->desc->ops->get_bypass(rdev, &bypass); if (ret != 0) report = "unknown"; else if (bypass) report = "enabled"; else report = "disabled"; return sprintf(buf, "%s\n", report); } static DEVICE_ATTR_RO(bypass); static ssize_t power_budget_milliwatt_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->constraints->pw_budget_mW); } static DEVICE_ATTR_RO(power_budget_milliwatt); static ssize_t power_requested_milliwatt_show(struct device *dev, struct device_attribute *attr, char *buf) { struct regulator_dev *rdev = dev_get_drvdata(dev); return sprintf(buf, "%d\n", rdev->pw_requested_mW); } static DEVICE_ATTR_RO(power_requested_milliwatt); #define REGULATOR_ERROR_ATTR(name, bit) \ static ssize_t name##_show(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ int ret; \ unsigned int flags; \ struct regulator_dev *rdev = dev_get_drvdata(dev); \ ret = _regulator_get_error_flags(rdev, &flags); \ if (ret) \ return ret; \ return sysfs_emit(buf, "%d\n", !!(flags & (bit))); \ } \ static DEVICE_ATTR_RO(name) REGULATOR_ERROR_ATTR(under_voltage, REGULATOR_ERROR_UNDER_VOLTAGE); REGULATOR_ERROR_ATTR(over_current, REGULATOR_ERROR_OVER_CURRENT); REGULATOR_ERROR_ATTR(regulation_out, REGULATOR_ERROR_REGULATION_OUT); REGULATOR_ERROR_ATTR(fail, REGULATOR_ERROR_FAIL); REGULATOR_ERROR_ATTR(over_temp, REGULATOR_ERROR_OVER_TEMP); REGULATOR_ERROR_ATTR(under_voltage_warn, REGULATOR_ERROR_UNDER_VOLTAGE_WARN); REGULATOR_ERROR_ATTR(over_current_warn, REGULATOR_ERROR_OVER_CURRENT_WARN); REGULATOR_ERROR_ATTR(over_voltage_warn, REGULATOR_ERROR_OVER_VOLTAGE_WARN); REGULATOR_ERROR_ATTR(over_temp_warn, REGULATOR_ERROR_OVER_TEMP_WARN); /* Calculate the new optimum regulator operating mode based on the new total * consumer load. All locks held by caller */ static int drms_uA_update(struct regulator_dev *rdev) { struct regulator *sibling; int current_uA = 0, output_uV, input_uV, err; unsigned int mode; /* * first check to see if we can set modes at all, otherwise just * tell the consumer everything is OK. */ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_DRMS)) { rdev_dbg(rdev, "DRMS operation not allowed\n"); return 0; } if (!rdev->desc->ops->get_optimum_mode && !rdev->desc->ops->set_load) return 0; if (!rdev->desc->ops->set_mode && !rdev->desc->ops->set_load) return -EINVAL; /* calc total requested load */ list_for_each_entry(sibling, &rdev->consumer_list, list) { if (sibling->enable_count) current_uA += sibling->uA_load; } current_uA += rdev->constraints->system_load; if (rdev->desc->ops->set_load) { /* set the optimum mode for our new total regulator load */ err = rdev->desc->ops->set_load(rdev, current_uA); if (err < 0) rdev_err(rdev, "failed to set load %d: %pe\n", current_uA, ERR_PTR(err)); } else { /* * Unfortunately in some cases the constraints->valid_ops has * REGULATOR_CHANGE_DRMS but there are no valid modes listed. * That's not really legit but we won't consider it a fatal * error here. We'll treat it as if REGULATOR_CHANGE_DRMS * wasn't set. */ if (!rdev->constraints->valid_modes_mask) { rdev_dbg(rdev, "Can change modes; but no valid mode\n"); return 0; } /* get output voltage */ output_uV = regulator_get_voltage_rdev(rdev); /* * Don't return an error; if regulator driver cares about * output_uV then it's up to the driver to validate. */ if (output_uV <= 0) rdev_dbg(rdev, "invalid output voltage found\n"); /* get input voltage */ input_uV = 0; if (rdev->supply) input_uV = regulator_get_voltage_rdev(rdev->supply->rdev); if (input_uV <= 0) input_uV = rdev->constraints->input_uV; /* * Don't return an error; if regulator driver cares about * input_uV then it's up to the driver to validate. */ if (input_uV <= 0) rdev_dbg(rdev, "invalid input voltage found\n"); /* now get the optimum mode for our new total regulator load */ mode = rdev->desc->ops->get_optimum_mode(rdev, input_uV, output_uV, current_uA); /* check the new mode is allowed */ err = regulator_mode_constrain(rdev, &mode); if (err < 0) { rdev_err(rdev, "failed to get optimum mode @ %d uA %d -> %d uV: %pe\n", current_uA, input_uV, output_uV, ERR_PTR(err)); return err; } err = rdev->desc->ops->set_mode(rdev, mode); if (err < 0) rdev_err(rdev, "failed to set optimum mode %x: %pe\n", mode, ERR_PTR(err)); } return err; } static int __suspend_set_state(struct regulator_dev *rdev, const struct regulator_state *rstate) { int ret = 0; if (rstate->enabled == ENABLE_IN_SUSPEND && rdev->desc->ops->set_suspend_enable) ret = rdev->desc->ops->set_suspend_enable(rdev); else if (rstate->enabled == DISABLE_IN_SUSPEND && rdev->desc->ops->set_suspend_disable) ret = rdev->desc->ops->set_suspend_disable(rdev); else /* OK if set_suspend_enable or set_suspend_disable is NULL */ ret = 0; if (ret < 0) { rdev_err(rdev, "failed to enabled/disable: %pe\n", ERR_PTR(ret)); return ret; } if (rdev->desc->ops->set_suspend_voltage && rstate->uV > 0) { ret = rdev->desc->ops->set_suspend_voltage(rdev, rstate->uV); if (ret < 0) { rdev_err(rdev, "failed to set voltage: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->desc->ops->set_suspend_mode && rstate->mode > 0) { ret = rdev->desc->ops->set_suspend_mode(rdev, rstate->mode); if (ret < 0) { rdev_err(rdev, "failed to set mode: %pe\n", ERR_PTR(ret)); return ret; } } return ret; } static int suspend_set_initial_state(struct regulator_dev *rdev) { const struct regulator_state *rstate; rstate = regulator_get_suspend_state_check(rdev, rdev->constraints->initial_state); if (!rstate) return 0; return __suspend_set_state(rdev, rstate); } #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) static void print_constraints_debug(struct regulator_dev *rdev) { struct regulation_constraints *constraints = rdev->constraints; char buf[160] = ""; size_t len = sizeof(buf) - 1; int count = 0; int ret; if (constraints->min_uV && constraints->max_uV) { if (constraints->min_uV == constraints->max_uV) count += scnprintf(buf + count, len - count, "%d mV ", constraints->min_uV / 1000); else count += scnprintf(buf + count, len - count, "%d <--> %d mV ", constraints->min_uV / 1000, constraints->max_uV / 1000); } if (!constraints->min_uV || constraints->min_uV != constraints->max_uV) { ret = regulator_get_voltage_rdev(rdev); if (ret > 0) count += scnprintf(buf + count, len - count, "at %d mV ", ret / 1000); } if (constraints->uV_offset) count += scnprintf(buf + count, len - count, "%dmV offset ", constraints->uV_offset / 1000); if (constraints->min_uA && constraints->max_uA) { if (constraints->min_uA == constraints->max_uA) count += scnprintf(buf + count, len - count, "%d mA ", constraints->min_uA / 1000); else count += scnprintf(buf + count, len - count, "%d <--> %d mA ", constraints->min_uA / 1000, constraints->max_uA / 1000); } if (!constraints->min_uA || constraints->min_uA != constraints->max_uA) { ret = _regulator_get_current_limit(rdev); if (ret > 0) count += scnprintf(buf + count, len - count, "at %d mA ", ret / 1000); } if (constraints->valid_modes_mask & REGULATOR_MODE_FAST) count += scnprintf(buf + count, len - count, "fast "); if (constraints->valid_modes_mask & REGULATOR_MODE_NORMAL) count += scnprintf(buf + count, len - count, "normal "); if (constraints->valid_modes_mask & REGULATOR_MODE_IDLE) count += scnprintf(buf + count, len - count, "idle "); if (constraints->valid_modes_mask & REGULATOR_MODE_STANDBY) count += scnprintf(buf + count, len - count, "standby "); if (constraints->pw_budget_mW) count += scnprintf(buf + count, len - count, "%d mW budget", constraints->pw_budget_mW); if (!count) count = scnprintf(buf, len, "no parameters"); else --count; count += scnprintf(buf + count, len - count, ", %s", _regulator_is_enabled(rdev) ? "enabled" : "disabled"); rdev_dbg(rdev, "%s\n", buf); } #else /* !DEBUG && !CONFIG_DYNAMIC_DEBUG */ static inline void print_constraints_debug(struct regulator_dev *rdev) {} #endif /* !DEBUG && !CONFIG_DYNAMIC_DEBUG */ static void print_constraints(struct regulator_dev *rdev) { struct regulation_constraints *constraints = rdev->constraints; print_constraints_debug(rdev); if ((constraints->min_uV != constraints->max_uV) && !regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) rdev_warn(rdev, "Voltage range but no REGULATOR_CHANGE_VOLTAGE\n"); } static int machine_constraints_voltage(struct regulator_dev *rdev, struct regulation_constraints *constraints) { const struct regulator_ops *ops = rdev->desc->ops; int ret; /* do we need to apply the constraint voltage */ if (rdev->constraints->apply_uV && rdev->constraints->min_uV && rdev->constraints->max_uV) { int target_min, target_max; int current_uV = regulator_get_voltage_rdev(rdev); if (current_uV == -ENOTRECOVERABLE) { /* This regulator can't be read and must be initialized */ rdev_info(rdev, "Setting %d-%duV\n", rdev->constraints->min_uV, rdev->constraints->max_uV); _regulator_do_set_voltage(rdev, rdev->constraints->min_uV, rdev->constraints->max_uV); current_uV = regulator_get_voltage_rdev(rdev); } if (current_uV < 0) { if (current_uV != -EPROBE_DEFER) rdev_err(rdev, "failed to get the current voltage: %pe\n", ERR_PTR(current_uV)); return current_uV; } /* * If we're below the minimum voltage move up to the * minimum voltage, if we're above the maximum voltage * then move down to the maximum. */ target_min = current_uV; target_max = current_uV; if (current_uV < rdev->constraints->min_uV) { target_min = rdev->constraints->min_uV; target_max = rdev->constraints->min_uV; } if (current_uV > rdev->constraints->max_uV) { target_min = rdev->constraints->max_uV; target_max = rdev->constraints->max_uV; } if (target_min != current_uV || target_max != current_uV) { rdev_info(rdev, "Bringing %duV into %d-%duV\n", current_uV, target_min, target_max); ret = _regulator_do_set_voltage( rdev, target_min, target_max); if (ret < 0) { rdev_err(rdev, "failed to apply %d-%duV constraint: %pe\n", target_min, target_max, ERR_PTR(ret)); return ret; } } } /* constrain machine-level voltage specs to fit * the actual range supported by this regulator. */ if (ops->list_voltage && rdev->desc->n_voltages) { int count = rdev->desc->n_voltages; int i; int min_uV = INT_MAX; int max_uV = INT_MIN; int cmin = constraints->min_uV; int cmax = constraints->max_uV; /* it's safe to autoconfigure fixed-voltage supplies * and the constraints are used by list_voltage. */ if (count == 1 && !cmin) { cmin = 1; cmax = INT_MAX; constraints->min_uV = cmin; constraints->max_uV = cmax; } /* voltage constraints are optional */ if ((cmin == 0) && (cmax == 0)) return 0; /* else require explicit machine-level constraints */ if (cmin <= 0 || cmax <= 0 || cmax < cmin) { rdev_err(rdev, "invalid voltage constraints\n"); return -EINVAL; } /* no need to loop voltages if range is continuous */ if (rdev->desc->continuous_voltage_range) return 0; /* initial: [cmin..cmax] valid, [min_uV..max_uV] not */ for (i = 0; i < count; i++) { int value; value = ops->list_voltage(rdev, i); if (value <= 0) continue; /* maybe adjust [min_uV..max_uV] */ if (value >= cmin && value < min_uV) min_uV = value; if (value <= cmax && value > max_uV) max_uV = value; } /* final: [min_uV..max_uV] valid iff constraints valid */ if (max_uV < min_uV) { rdev_err(rdev, "unsupportable voltage constraints %u-%uuV\n", min_uV, max_uV); return -EINVAL; } /* use regulator's subset of machine constraints */ if (constraints->min_uV < min_uV) { rdev_dbg(rdev, "override min_uV, %d -> %d\n", constraints->min_uV, min_uV); constraints->min_uV = min_uV; } if (constraints->max_uV > max_uV) { rdev_dbg(rdev, "override max_uV, %d -> %d\n", constraints->max_uV, max_uV); constraints->max_uV = max_uV; } } return 0; } static int machine_constraints_current(struct regulator_dev *rdev, struct regulation_constraints *constraints) { const struct regulator_ops *ops = rdev->desc->ops; int ret; if (!constraints->min_uA && !constraints->max_uA) return 0; if (constraints->min_uA > constraints->max_uA) { rdev_err(rdev, "Invalid current constraints\n"); return -EINVAL; } if (!ops->set_current_limit || !ops->get_current_limit) { rdev_warn(rdev, "Operation of current configuration missing\n"); return 0; } /* Set regulator current in constraints range */ ret = ops->set_current_limit(rdev, constraints->min_uA, constraints->max_uA); if (ret < 0) { rdev_err(rdev, "Failed to set current constraint, %d\n", ret); return ret; } return 0; } static int _regulator_do_enable(struct regulator_dev *rdev); static int notif_set_limit(struct regulator_dev *rdev, int (*set)(struct regulator_dev *, int, int, bool), int limit, int severity) { bool enable; if (limit == REGULATOR_NOTIF_LIMIT_DISABLE) { enable = false; limit = 0; } else { enable = true; } if (limit == REGULATOR_NOTIF_LIMIT_ENABLE) limit = 0; return set(rdev, limit, severity, enable); } static int handle_notify_limits(struct regulator_dev *rdev, int (*set)(struct regulator_dev *, int, int, bool), struct notification_limit *limits) { int ret = 0; if (!set) return -EOPNOTSUPP; if (limits->prot) ret = notif_set_limit(rdev, set, limits->prot, REGULATOR_SEVERITY_PROT); if (ret) return ret; if (limits->err) ret = notif_set_limit(rdev, set, limits->err, REGULATOR_SEVERITY_ERR); if (ret) return ret; if (limits->warn) ret = notif_set_limit(rdev, set, limits->warn, REGULATOR_SEVERITY_WARN); return ret; } /** * set_machine_constraints - sets regulator constraints * @rdev: regulator source * * Allows platform initialisation code to define and constrain * regulator circuits e.g. valid voltage/current ranges, etc. NOTE: * Constraints *must* be set by platform code in order for some * regulator operations to proceed i.e. set_voltage, set_current_limit, * set_mode. * * Return: 0 on success or a negative error number on failure. */ static int set_machine_constraints(struct regulator_dev *rdev) { int ret = 0; const struct regulator_ops *ops = rdev->desc->ops; ret = machine_constraints_voltage(rdev, rdev->constraints); if (ret != 0) return ret; ret = machine_constraints_current(rdev, rdev->constraints); if (ret != 0) return ret; if (rdev->constraints->ilim_uA && ops->set_input_current_limit) { ret = ops->set_input_current_limit(rdev, rdev->constraints->ilim_uA); if (ret < 0) { rdev_err(rdev, "failed to set input limit: %pe\n", ERR_PTR(ret)); return ret; } } /* do we need to setup our suspend state */ if (rdev->constraints->initial_state) { ret = suspend_set_initial_state(rdev); if (ret < 0) { rdev_err(rdev, "failed to set suspend state: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->initial_mode) { if (!ops->set_mode) { rdev_err(rdev, "no set_mode operation\n"); return -EINVAL; } ret = ops->set_mode(rdev, rdev->constraints->initial_mode); if (ret < 0) { rdev_err(rdev, "failed to set initial mode: %pe\n", ERR_PTR(ret)); return ret; } } else if (rdev->constraints->system_load) { /* * We'll only apply the initial system load if an * initial mode wasn't specified. */ drms_uA_update(rdev); } if ((rdev->constraints->ramp_delay || rdev->constraints->ramp_disable) && ops->set_ramp_delay) { ret = ops->set_ramp_delay(rdev, rdev->constraints->ramp_delay); if (ret < 0) { rdev_err(rdev, "failed to set ramp_delay: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->pull_down && ops->set_pull_down) { ret = ops->set_pull_down(rdev); if (ret < 0) { rdev_err(rdev, "failed to set pull down: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->soft_start && ops->set_soft_start) { ret = ops->set_soft_start(rdev); if (ret < 0) { rdev_err(rdev, "failed to set soft start: %pe\n", ERR_PTR(ret)); return ret; } } /* * Existing logic does not warn if over_current_protection is given as * a constraint but driver does not support that. I think we should * warn about this type of issues as it is possible someone changes * PMIC on board to another type - and the another PMIC's driver does * not support setting protection. Board composer may happily believe * the DT limits are respected - especially if the new PMIC HW also * supports protection but the driver does not. I won't change the logic * without hearing more experienced opinion on this though. * * If warning is seen as a good idea then we can merge handling the * over-curret protection and detection and get rid of this special * handling. */ if (rdev->constraints->over_current_protection && ops->set_over_current_protection) { int lim = rdev->constraints->over_curr_limits.prot; ret = ops->set_over_current_protection(rdev, lim, REGULATOR_SEVERITY_PROT, true); if (ret < 0) { rdev_err(rdev, "failed to set over current protection: %pe\n", ERR_PTR(ret)); return ret; } } if (rdev->constraints->over_current_detection) ret = handle_notify_limits(rdev, ops->set_over_current_protection, &rdev->constraints->over_curr_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set over current limits: %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested over-current limits\n"); } if (rdev->constraints->over_voltage_detection) ret = handle_notify_limits(rdev, ops->set_over_voltage_protection, &rdev->constraints->over_voltage_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set over voltage limits %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested over voltage limits\n"); } if (rdev->constraints->under_voltage_detection) ret = handle_notify_limits(rdev, ops->set_under_voltage_protection, &rdev->constraints->under_voltage_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set under voltage limits %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested under voltage limits\n"); } if (rdev->constraints->over_temp_detection) ret = handle_notify_limits(rdev, ops->set_thermal_protection, &rdev->constraints->temp_limits); if (ret) { if (ret != -EOPNOTSUPP) { rdev_err(rdev, "failed to set temperature limits %pe\n", ERR_PTR(ret)); return ret; } rdev_warn(rdev, "IC does not support requested temperature limits\n"); } if (rdev->constraints->active_discharge && ops->set_active_discharge) { bool ad_state = rdev->constraints->active_discharge == REGULATOR_ACTIVE_DISCHARGE_ENABLE; ret = ops->set_active_discharge(rdev, ad_state); if (ret < 0) { rdev_err(rdev, "failed to set active discharge: %pe\n", ERR_PTR(ret)); return ret; } } /* * If there is no mechanism for controlling the regulator then * flag it as always_on so we don't end up duplicating checks * for this so much. Note that we could control the state of * a supply to control the output on a regulator that has no * direct control. */ if (!rdev->ena_pin && !ops->enable) { if (rdev->supply_name && !rdev->supply) return -EPROBE_DEFER; if (rdev->supply) rdev->constraints->always_on = rdev->supply->rdev->constraints->always_on; else rdev->constraints->always_on = true; } /* If the constraints say the regulator should be on at this point * and we have control then make sure it is enabled. */ if (rdev->constraints->always_on || rdev->constraints->boot_on) { /* If we want to enable this regulator, make sure that we know * the supplying regulator. */ if (rdev->supply_name && !rdev->supply) return -EPROBE_DEFER; /* If supplying regulator has already been enabled, * it's not intended to have use_count increment * when rdev is only boot-on. */ if (rdev->supply && (rdev->constraints->always_on || !regulator_is_enabled(rdev->supply))) { ret = regulator_enable(rdev->supply); if (ret < 0) { _regulator_put(rdev->supply); rdev->supply = NULL; return ret; } } ret = _regulator_do_enable(rdev); if (ret < 0 && ret != -EINVAL) { rdev_err(rdev, "failed to enable: %pe\n", ERR_PTR(ret)); return ret; } if (rdev->constraints->always_on) rdev->use_count++; } else if (rdev->desc->off_on_delay) { rdev->last_off = ktime_get(); } if (!rdev->constraints->pw_budget_mW) rdev->constraints->pw_budget_mW = INT_MAX; print_constraints(rdev); return 0; } /** * set_supply - set regulator supply regulator * @rdev: regulator (locked) * @supply_rdev: supply regulator (locked)) * * Called by platform initialisation code to set the supply regulator for this * regulator. This ensures that a regulators supply will also be enabled by the * core if it's child is enabled. * * Return: 0 on success or a negative error number on failure. */ static int set_supply(struct regulator_dev *rdev, struct regulator_dev *supply_rdev) { int err; rdev_dbg(rdev, "supplied by %s\n", rdev_get_name(supply_rdev)); if (!try_module_get(supply_rdev->owner)) return -ENODEV; rdev->supply = create_regulator(supply_rdev, &rdev->dev, "SUPPLY"); if (rdev->supply == NULL) { module_put(supply_rdev->owner); err = -ENOMEM; return err; } supply_rdev->open_count++; return 0; } /** * set_consumer_device_supply - Bind a regulator to a symbolic supply * @rdev: regulator source * @consumer_dev_name: dev_name() string for device supply applies to * @supply: symbolic name for supply * * Allows platform initialisation code to map physical regulator * sources to symbolic names for supplies for use by devices. Devices * should use these symbolic names to request regulators, avoiding the * need to provide board-specific regulator names as platform data. * * Return: 0 on success or a negative error number on failure. */ static int set_consumer_device_supply(struct regulator_dev *rdev, const char *consumer_dev_name, const char *supply) { struct regulator_map *node, *new_node; int has_dev; if (supply == NULL) return -EINVAL; if (consumer_dev_name != NULL) has_dev = 1; else has_dev = 0; new_node = kzalloc(sizeof(struct regulator_map), GFP_KERNEL); if (new_node == NULL) return -ENOMEM; new_node->regulator = rdev; new_node->supply = supply; if (has_dev) { new_node->dev_name = kstrdup(consumer_dev_name, GFP_KERNEL); if (new_node->dev_name == NULL) { kfree(new_node); return -ENOMEM; } } mutex_lock(&regulator_list_mutex); list_for_each_entry(node, &regulator_map_list, list) { if (node->dev_name && consumer_dev_name) { if (strcmp(node->dev_name, consumer_dev_name) != 0) continue; } else if (node->dev_name || consumer_dev_name) { continue; } if (strcmp(node->supply, supply) != 0) continue; pr_debug("%s: %s/%s is '%s' supply; fail %s/%s\n", consumer_dev_name, dev_name(&node->regulator->dev), node->regulator->desc->name, supply, dev_name(&rdev->dev), rdev_get_name(rdev)); goto fail; } list_add(&new_node->list, &regulator_map_list); mutex_unlock(&regulator_list_mutex); return 0; fail: mutex_unlock(&regulator_list_mutex); kfree(new_node->dev_name); kfree(new_node); return -EBUSY; } static void unset_regulator_supplies(struct regulator_dev *rdev) { struct regulator_map *node, *n; list_for_each_entry_safe(node, n, &regulator_map_list, list) { if (rdev == node->regulator) { list_del(&node->list); kfree(node->dev_name); kfree(node); } } } #ifdef CONFIG_DEBUG_FS static ssize_t constraint_flags_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { const struct regulator *regulator = file->private_data; const struct regulation_constraints *c = regulator->rdev->constraints; char *buf; ssize_t ret; if (!c) return 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; ret = snprintf(buf, PAGE_SIZE, "always_on: %u\n" "boot_on: %u\n" "apply_uV: %u\n" "ramp_disable: %u\n" "soft_start: %u\n" "pull_down: %u\n" "over_current_protection: %u\n", c->always_on, c->boot_on, c->apply_uV, c->ramp_disable, c->soft_start, c->pull_down, c->over_current_protection); ret = simple_read_from_buffer(user_buf, count, ppos, buf, ret); kfree(buf); return ret; } #endif static const struct file_operations constraint_flags_fops = { #ifdef CONFIG_DEBUG_FS .open = simple_open, .read = constraint_flags_read_file, .llseek = default_llseek, #endif }; #define REG_STR_SIZE 64 static void link_and_create_debugfs(struct regulator *regulator, struct regulator_dev *rdev, struct device *dev) { int err = 0; if (dev) { regulator->dev = dev; /* Add a link to the device sysfs entry */ err = sysfs_create_link_nowarn(&rdev->dev.kobj, &dev->kobj, regulator->supply_name); if (err) { rdev_dbg(rdev, "could not add device link %s: %pe\n", dev->kobj.name, ERR_PTR(err)); /* non-fatal */ } } if (err != -EEXIST) { regulator->debugfs = debugfs_create_dir(regulator->supply_name, rdev->debugfs); if (IS_ERR(regulator->debugfs)) { rdev_dbg(rdev, "Failed to create debugfs directory\n"); regulator->debugfs = NULL; } } if (regulator->debugfs) { debugfs_create_u32("uA_load", 0444, regulator->debugfs, &regulator->uA_load); debugfs_create_u32("min_uV", 0444, regulator->debugfs, &regulator->voltage[PM_SUSPEND_ON].min_uV); debugfs_create_u32("max_uV", 0444, regulator->debugfs, &regulator->voltage[PM_SUSPEND_ON].max_uV); debugfs_create_file("constraint_flags", 0444, regulator->debugfs, regulator, &constraint_flags_fops); } } static struct regulator *create_regulator(struct regulator_dev *rdev, struct device *dev, const char *supply_name) { struct regulator *regulator; lockdep_assert_held_once(&rdev->mutex.base); if (dev) { char buf[REG_STR_SIZE]; int size; size = snprintf(buf, REG_STR_SIZE, "%s-%s", dev->kobj.name, supply_name); if (size >= REG_STR_SIZE) return NULL; supply_name = kstrdup(buf, GFP_KERNEL); if (supply_name == NULL) return NULL; } else { supply_name = kstrdup_const(supply_name, GFP_KERNEL); if (supply_name == NULL) return NULL; } regulator = kzalloc(sizeof(*regulator), GFP_KERNEL); if (regulator == NULL) { kfree_const(supply_name); return NULL; } regulator->rdev = rdev; regulator->supply_name = supply_name; list_add(&regulator->list, &rdev->consumer_list); /* * Check now if the regulator is an always on regulator - if * it is then we don't need to do nearly so much work for * enable/disable calls. */ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS) && _regulator_is_enabled(rdev)) regulator->always_on = true; return regulator; } static int _regulator_get_enable_time(struct regulator_dev *rdev) { if (rdev->constraints && rdev->constraints->enable_time) return rdev->constraints->enable_time; if (rdev->desc->ops->enable_time) return rdev->desc->ops->enable_time(rdev); return rdev->desc->enable_time; } static struct regulator_supply_alias *regulator_find_supply_alias( struct device *dev, const char *supply) { struct regulator_supply_alias *map; list_for_each_entry(map, &regulator_supply_alias_list, list) if (map->src_dev == dev && strcmp(map->src_supply, supply) == 0) return map; return NULL; } static void regulator_supply_alias(struct device **dev, const char **supply) { struct regulator_supply_alias *map; map = regulator_find_supply_alias(*dev, *supply); if (map) { dev_dbg(*dev, "Mapping supply %s to %s,%s\n", *supply, map->alias_supply, dev_name(map->alias_dev)); *dev = map->alias_dev; *supply = map->alias_supply; } } static int regulator_match(struct device *dev, const void *data) { struct regulator_dev *r = dev_to_rdev(dev); return strcmp(rdev_get_name(r), data) == 0; } static struct regulator_dev *regulator_lookup_by_name(const char *name) { struct device *dev; dev = class_find_device(&regulator_class, NULL, name, regulator_match); return dev ? dev_to_rdev(dev) : NULL; } static struct regulator_dev *regulator_dt_lookup(struct device *dev, const char *supply) { struct regulator_dev *r = NULL; if (dev_of_node(dev)) { r = of_regulator_dev_lookup(dev, dev_of_node(dev), supply); if (PTR_ERR(r) == -ENODEV) r = NULL; } return r; } /** * regulator_dev_lookup - lookup a regulator device. * @dev: device for regulator "consumer". * @supply: Supply name or regulator ID. * * Return: pointer to &struct regulator_dev or ERR_PTR() encoded negative error number. * * If successful, returns a struct regulator_dev that corresponds to the name * @supply and with the embedded struct device refcount incremented by one. * The refcount must be dropped by calling put_device(). * On failure one of the following ERR_PTR() encoded values is returned: * -%ENODEV if lookup fails permanently, -%EPROBE_DEFER if lookup could succeed * in the future. */ static struct regulator_dev *regulator_dev_lookup(struct device *dev, const char *supply) { struct regulator_dev *r = NULL; struct regulator_map *map; const char *devname = NULL; regulator_supply_alias(&dev, &supply); /* first do a dt based lookup */ r = regulator_dt_lookup(dev, supply); if (r) return r; /* if not found, try doing it non-dt way */ if (dev) devname = dev_name(dev); mutex_lock(&regulator_list_mutex); list_for_each_entry(map, &regulator_map_list, list) { /* If the mapping has a device set up it must match */ if (map->dev_name && (!devname || strcmp(map->dev_name, devname))) continue; if (strcmp(map->supply, supply) == 0 && get_device(&map->regulator->dev)) { r = map->regulator; break; } } mutex_unlock(&regulator_list_mutex); if (r) return r; r = regulator_lookup_by_name(supply); if (r) return r; return ERR_PTR(-ENODEV); } static int regulator_resolve_supply(struct regulator_dev *rdev) { struct regulator_dev *r; struct device *dev = rdev->dev.parent; struct ww_acquire_ctx ww_ctx; int ret = 0; /* No supply to resolve? */ if (!rdev->supply_name) return 0; /* Supply already resolved? (fast-path without locking contention) */ if (rdev->supply) return 0; /* first do a dt based lookup on the node described in the virtual * device. */ r = regulator_dt_lookup(&rdev->dev, rdev->supply_name); /* If regulator not found use usual search path in the parent * device. */ if (!r) r = regulator_dev_lookup(dev, rdev->supply_name); if (IS_ERR(r)) { ret = PTR_ERR(r); /* Did the lookup explicitly defer for us? */ if (ret == -EPROBE_DEFER) goto out; if (have_full_constraints()) { r = dummy_regulator_rdev; if (!r) { ret = -EPROBE_DEFER; goto out; } get_device(&r->dev); } else { dev_err(dev, "Failed to resolve %s-supply for %s\n", rdev->supply_name, rdev->desc->name); ret = -EPROBE_DEFER; goto out; } } if (r == rdev) { dev_err(dev, "Supply for %s (%s) resolved to itself\n", rdev->desc->name, rdev->supply_name); if (!have_full_constraints()) { ret = -EINVAL; goto out; } r = dummy_regulator_rdev; if (!r) { ret = -EPROBE_DEFER; goto out; } get_device(&r->dev); } /* * If the supply's parent device is not the same as the * regulator's parent device, then ensure the parent device * is bound before we resolve the supply, in case the parent * device get probe deferred and unregisters the supply. */ if (r->dev.parent && r->dev.parent != rdev->dev.parent) { if (!device_is_bound(r->dev.parent)) { put_device(&r->dev); ret = -EPROBE_DEFER; goto out; } } /* Recursively resolve the supply of the supply */ ret = regulator_resolve_supply(r); if (ret < 0) { put_device(&r->dev); goto out; } /* * Recheck rdev->supply with rdev->mutex lock held to avoid a race * between rdev->supply null check and setting rdev->supply in * set_supply() from concurrent tasks. */ regulator_lock_two(rdev, r, &ww_ctx); /* Supply just resolved by a concurrent task? */ if (rdev->supply) { regulator_unlock_two(rdev, r, &ww_ctx); put_device(&r->dev); goto out; } ret = set_supply(rdev, r); if (ret < 0) { regulator_unlock_two(rdev, r, &ww_ctx); put_device(&r->dev); goto out; } regulator_unlock_two(rdev, r, &ww_ctx); /* rdev->supply was created in set_supply() */ link_and_create_debugfs(rdev->supply, r, &rdev->dev); /* * In set_machine_constraints() we may have turned this regulator on * but we couldn't propagate to the supply if it hadn't been resolved * yet. Do it now. */ if (rdev->use_count) { ret = regulator_enable(rdev->supply); if (ret < 0) { _regulator_put(rdev->supply); rdev->supply = NULL; goto out; } } out: return ret; } /* common pre-checks for regulator requests */ int _regulator_get_common_check(struct device *dev, const char *id, enum regulator_get_type get_type) { if (get_type >= MAX_GET_TYPE) { dev_err(dev, "invalid type %d in %s\n", get_type, __func__); return -EINVAL; } if (id == NULL) { dev_err(dev, "regulator request with no identifier\n"); return -EINVAL; } return 0; } /** * _regulator_get_common - Common code for regulator requests * @rdev: regulator device pointer as returned by *regulator_dev_lookup() * Its reference count is expected to have been incremented. * @dev: device used for dev_printk messages * @id: Supply name or regulator ID * @get_type: enum regulator_get_type value corresponding to type of request * * Returns: pointer to struct regulator corresponding to @rdev, or ERR_PTR() * encoded error. * * This function should be chained with *regulator_dev_lookup() functions. */ struct regulator *_regulator_get_common(struct regulator_dev *rdev, struct device *dev, const char *id, enum regulator_get_type get_type) { struct regulator *regulator; struct device_link *link; int ret; if (IS_ERR(rdev)) { ret = PTR_ERR(rdev); /* * If regulator_dev_lookup() fails with error other * than -ENODEV our job here is done, we simply return it. */ if (ret != -ENODEV) return ERR_PTR(ret); if (!have_full_constraints()) { dev_warn(dev, "incomplete constraints, dummy supplies not allowed (id=%s)\n", id); return ERR_PTR(-ENODEV); } switch (get_type) { case NORMAL_GET: /* * Assume that a regulator is physically present and * enabled, even if it isn't hooked up, and just * provide a dummy. */ rdev = dummy_regulator_rdev; if (!rdev) return ERR_PTR(-EPROBE_DEFER); dev_warn(dev, "supply %s not found, using dummy regulator\n", id); get_device(&rdev->dev); break; case EXCLUSIVE_GET: dev_warn(dev, "dummy supplies not allowed for exclusive requests (id=%s)\n", id); fallthrough; default: return ERR_PTR(-ENODEV); } } if (rdev->exclusive) { regulator = ERR_PTR(-EPERM); put_device(&rdev->dev); return regulator; } if (get_type == EXCLUSIVE_GET && rdev->open_count) { regulator = ERR_PTR(-EBUSY); put_device(&rdev->dev); return regulator; } mutex_lock(&regulator_list_mutex); ret = (rdev->coupling_desc.n_resolved != rdev->coupling_desc.n_coupled); mutex_unlock(&regulator_list_mutex); if (ret != 0) { regulator = ERR_PTR(-EPROBE_DEFER); put_device(&rdev->dev); return regulator; } ret = regulator_resolve_supply(rdev); if (ret < 0) { regulator = ERR_PTR(ret); put_device(&rdev->dev); return regulator; } if (!try_module_get(rdev->owner)) { regulator = ERR_PTR(-EPROBE_DEFER); put_device(&rdev->dev); return regulator; } regulator_lock(rdev); regulator = create_regulator(rdev, dev, id); regulator_unlock(rdev); if (regulator == NULL) { regulator = ERR_PTR(-ENOMEM); module_put(rdev->owner); put_device(&rdev->dev); return regulator; } link_and_create_debugfs(regulator, rdev, dev); rdev->open_count++; if (get_type == EXCLUSIVE_GET) { rdev->exclusive = 1; ret = _regulator_is_enabled(rdev); if (ret > 0) { rdev->use_count = 1; regulator->enable_count = 1; /* Propagate the regulator state to its supply */ if (rdev->supply) { ret = regulator_enable(rdev->supply); if (ret < 0) { destroy_regulator(regulator); module_put(rdev->owner); put_device(&rdev->dev); return ERR_PTR(ret); } } } else { rdev->use_count = 0; regulator->enable_count = 0; } } link = device_link_add(dev, &rdev->dev, DL_FLAG_STATELESS); if (!IS_ERR_OR_NULL(link)) regulator->device_link = true; return regulator; } /* Internal regulator request function */ struct regulator *_regulator_get(struct device *dev, const char *id, enum regulator_get_type get_type) { struct regulator_dev *rdev; int ret; ret = _regulator_get_common_check(dev, id, get_type); if (ret) return ERR_PTR(ret); rdev = regulator_dev_lookup(dev, id); return _regulator_get_common(rdev, dev, id, get_type); } /** * regulator_get - lookup and obtain a reference to a regulator. * @dev: device for regulator "consumer" * @id: Supply name or regulator ID. * * Use of supply names configured via set_consumer_device_supply() is * strongly encouraged. It is recommended that the supply name used * should match the name used for the supply and/or the relevant * device pins in the datasheet. * * Return: Pointer to a &struct regulator corresponding to the regulator * producer, or an ERR_PTR() encoded negative error number. */ struct regulator *regulator_get(struct device *dev, const char *id) { return _regulator_get(dev, id, NORMAL_GET); } EXPORT_SYMBOL_GPL(regulator_get); /** * regulator_get_exclusive - obtain exclusive access to a regulator. * @dev: device for regulator "consumer" * @id: Supply name or regulator ID. * * Other consumers will be unable to obtain this regulator while this * reference is held and the use count for the regulator will be * initialised to reflect the current state of the regulator. * * This is intended for use by consumers which cannot tolerate shared * use of the regulator such as those which need to force the * regulator off for correct operation of the hardware they are * controlling. * * Use of supply names configured via set_consumer_device_supply() is * strongly encouraged. It is recommended that the supply name used * should match the name used for the supply and/or the relevant * device pins in the datasheet. * * Return: Pointer to a &struct regulator corresponding to the regulator * producer, or an ERR_PTR() encoded negative error number. */ struct regulator *regulator_get_exclusive(struct device *dev, const char *id) { return _regulator_get(dev, id, EXCLUSIVE_GET); } EXPORT_SYMBOL_GPL(regulator_get_exclusive); /** * regulator_get_optional - obtain optional access to a regulator. * @dev: device for regulator "consumer" * @id: Supply name or regulator ID. * * This is intended for use by consumers for devices which can have * some supplies unconnected in normal use, such as some MMC devices. * It can allow the regulator core to provide stub supplies for other * supplies requested using normal regulator_get() calls without * disrupting the operation of drivers that can handle absent * supplies. * * Use of supply names configured via set_consumer_device_supply() is * strongly encouraged. It is recommended that the supply name used * should match the name used for the supply and/or the relevant * device pins in the datasheet. * * Return: Pointer to a &struct regulator corresponding to the regulator * producer, or an ERR_PTR() encoded negative error number. */ struct regulator *regulator_get_optional(struct device *dev, const char *id) { return _regulator_get(dev, id, OPTIONAL_GET); } EXPORT_SYMBOL_GPL(regulator_get_optional); static void destroy_regulator(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; debugfs_remove_recursive(regulator->debugfs); if (regulator->dev) { if (regulator->device_link) device_link_remove(regulator->dev, &rdev->dev); /* remove any sysfs entries */ sysfs_remove_link(&rdev->dev.kobj, regulator->supply_name); } regulator_lock(rdev); list_del(&regulator->list); rdev->open_count--; rdev->exclusive = 0; regulator_unlock(rdev); kfree_const(regulator->supply_name); kfree(regulator); } /* regulator_list_mutex lock held by regulator_put() */ static void _regulator_put(struct regulator *regulator) { struct regulator_dev *rdev; if (IS_ERR_OR_NULL(regulator)) return; lockdep_assert_held_once(&regulator_list_mutex); /* Docs say you must disable before calling regulator_put() */ WARN_ON(regulator->enable_count); rdev = regulator->rdev; destroy_regulator(regulator); module_put(rdev->owner); put_device(&rdev->dev); } /** * regulator_put - "free" the regulator source * @regulator: regulator source * * Note: drivers must ensure that all regulator_enable calls made on this * regulator source are balanced by regulator_disable calls prior to calling * this function. */ void regulator_put(struct regulator *regulator) { mutex_lock(&regulator_list_mutex); _regulator_put(regulator); mutex_unlock(&regulator_list_mutex); } EXPORT_SYMBOL_GPL(regulator_put); /** * regulator_register_supply_alias - Provide device alias for supply lookup * * @dev: device that will be given as the regulator "consumer" * @id: Supply name or regulator ID * @alias_dev: device that should be used to lookup the supply * @alias_id: Supply name or regulator ID that should be used to lookup the * supply * * All lookups for id on dev will instead be conducted for alias_id on * alias_dev. * * Return: 0 on success or a negative error number on failure. */ int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id) { struct regulator_supply_alias *map; map = regulator_find_supply_alias(dev, id); if (map) return -EEXIST; map = kzalloc(sizeof(struct regulator_supply_alias), GFP_KERNEL); if (!map) return -ENOMEM; map->src_dev = dev; map->src_supply = id; map->alias_dev = alias_dev; map->alias_supply = alias_id; list_add(&map->list, &regulator_supply_alias_list); pr_info("Adding alias for supply %s,%s -> %s,%s\n", id, dev_name(dev), alias_id, dev_name(alias_dev)); return 0; } EXPORT_SYMBOL_GPL(regulator_register_supply_alias); /** * regulator_unregister_supply_alias - Remove device alias * * @dev: device that will be given as the regulator "consumer" * @id: Supply name or regulator ID * * Remove a lookup alias if one exists for id on dev. */ void regulator_unregister_supply_alias(struct device *dev, const char *id) { struct regulator_supply_alias *map; map = regulator_find_supply_alias(dev, id); if (map) { list_del(&map->list); kfree(map); } } EXPORT_SYMBOL_GPL(regulator_unregister_supply_alias); /** * regulator_bulk_register_supply_alias - register multiple aliases * * @dev: device that will be given as the regulator "consumer" * @id: List of supply names or regulator IDs * @alias_dev: device that should be used to lookup the supply * @alias_id: List of supply names or regulator IDs that should be used to * lookup the supply * @num_id: Number of aliases to register * * This helper function allows drivers to register several supply * aliases in one operation. If any of the aliases cannot be * registered any aliases that were registered will be removed * before returning to the caller. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_register_supply_alias(struct device *dev, const char *const *id, struct device *alias_dev, const char *const *alias_id, int num_id) { int i; int ret; for (i = 0; i < num_id; ++i) { ret = regulator_register_supply_alias(dev, id[i], alias_dev, alias_id[i]); if (ret < 0) goto err; } return 0; err: dev_err(dev, "Failed to create supply alias %s,%s -> %s,%s\n", id[i], dev_name(dev), alias_id[i], dev_name(alias_dev)); while (--i >= 0) regulator_unregister_supply_alias(dev, id[i]); return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_register_supply_alias); /** * regulator_bulk_unregister_supply_alias - unregister multiple aliases * * @dev: device that will be given as the regulator "consumer" * @id: List of supply names or regulator IDs * @num_id: Number of aliases to unregister * * This helper function allows drivers to unregister several supply * aliases in one operation. */ void regulator_bulk_unregister_supply_alias(struct device *dev, const char *const *id, int num_id) { int i; for (i = 0; i < num_id; ++i) regulator_unregister_supply_alias(dev, id[i]); } EXPORT_SYMBOL_GPL(regulator_bulk_unregister_supply_alias); /* Manage enable GPIO list. Same GPIO pin can be shared among regulators */ static int regulator_ena_gpio_request(struct regulator_dev *rdev, const struct regulator_config *config) { struct regulator_enable_gpio *pin, *new_pin; struct gpio_desc *gpiod; gpiod = config->ena_gpiod; new_pin = kzalloc(sizeof(*new_pin), GFP_KERNEL); mutex_lock(&regulator_list_mutex); list_for_each_entry(pin, &regulator_ena_gpio_list, list) { if (gpiod_is_equal(pin->gpiod, gpiod)) { rdev_dbg(rdev, "GPIO is already used\n"); goto update_ena_gpio_to_rdev; } } if (new_pin == NULL) { mutex_unlock(&regulator_list_mutex); return -ENOMEM; } pin = new_pin; new_pin = NULL; pin->gpiod = gpiod; list_add(&pin->list, &regulator_ena_gpio_list); update_ena_gpio_to_rdev: pin->request_count++; rdev->ena_pin = pin; mutex_unlock(&regulator_list_mutex); kfree(new_pin); return 0; } static void regulator_ena_gpio_free(struct regulator_dev *rdev) { struct regulator_enable_gpio *pin, *n; if (!rdev->ena_pin) return; /* Free the GPIO only in case of no use */ list_for_each_entry_safe(pin, n, &regulator_ena_gpio_list, list) { if (pin != rdev->ena_pin) continue; if (--pin->request_count) break; gpiod_put(pin->gpiod); list_del(&pin->list); kfree(pin); break; } rdev->ena_pin = NULL; } /** * regulator_ena_gpio_ctrl - balance enable_count of each GPIO and actual GPIO pin control * @rdev: regulator_dev structure * @enable: enable GPIO at initial use? * * GPIO is enabled in case of initial use. (enable_count is 0) * GPIO is disabled when it is not shared any more. (enable_count <= 1) * * Return: 0 on success or a negative error number on failure. */ static int regulator_ena_gpio_ctrl(struct regulator_dev *rdev, bool enable) { struct regulator_enable_gpio *pin = rdev->ena_pin; if (!pin) return -EINVAL; if (enable) { /* Enable GPIO at initial use */ if (pin->enable_count == 0) gpiod_set_value_cansleep(pin->gpiod, 1); pin->enable_count++; } else { if (pin->enable_count > 1) { pin->enable_count--; return 0; } /* Disable GPIO if not used */ if (pin->enable_count <= 1) { gpiod_set_value_cansleep(pin->gpiod, 0); pin->enable_count = 0; } } return 0; } /** * _regulator_check_status_enabled - check if regulator status can be * interpreted as "regulator is enabled" * @rdev: the regulator device to check * * Return: * * 1 - if status shows regulator is in enabled state * * 0 - if not enabled state * * Error Value - as received from ops->get_status() */ static inline int _regulator_check_status_enabled(struct regulator_dev *rdev) { int ret = rdev->desc->ops->get_status(rdev); if (ret < 0) { rdev_info(rdev, "get_status returned error: %d\n", ret); return ret; } switch (ret) { case REGULATOR_STATUS_OFF: case REGULATOR_STATUS_ERROR: case REGULATOR_STATUS_UNDEFINED: return 0; default: return 1; } } static int _regulator_do_enable(struct regulator_dev *rdev) { int ret, delay; /* Query before enabling in case configuration dependent. */ ret = _regulator_get_enable_time(rdev); if (ret >= 0) { delay = ret; } else { rdev_warn(rdev, "enable_time() failed: %pe\n", ERR_PTR(ret)); delay = 0; } trace_regulator_enable(rdev_get_name(rdev)); if (rdev->desc->off_on_delay) { /* if needed, keep a distance of off_on_delay from last time * this regulator was disabled. */ ktime_t end = ktime_add_us(rdev->last_off, rdev->desc->off_on_delay); s64 remaining = ktime_us_delta(end, ktime_get_boottime()); if (remaining > 0) fsleep(remaining); } if (rdev->ena_pin) { if (!rdev->ena_gpio_state) { ret = regulator_ena_gpio_ctrl(rdev, true); if (ret < 0) return ret; rdev->ena_gpio_state = 1; } } else if (rdev->desc->ops->enable) { ret = rdev->desc->ops->enable(rdev); if (ret < 0) return ret; } else { return -EINVAL; } /* Allow the regulator to ramp; it would be useful to extend * this for bulk operations so that the regulators can ramp * together. */ trace_regulator_enable_delay(rdev_get_name(rdev)); /* If poll_enabled_time is set, poll upto the delay calculated * above, delaying poll_enabled_time uS to check if the regulator * actually got enabled. * If the regulator isn't enabled after our delay helper has expired, * return -ETIMEDOUT. */ if (rdev->desc->poll_enabled_time) { int time_remaining = delay; while (time_remaining > 0) { fsleep(rdev->desc->poll_enabled_time); if (rdev->desc->ops->get_status) { ret = _regulator_check_status_enabled(rdev); if (ret < 0) return ret; else if (ret) break; } else if (rdev->desc->ops->is_enabled(rdev)) break; time_remaining -= rdev->desc->poll_enabled_time; } if (time_remaining <= 0) { rdev_err(rdev, "Enabled check timed out\n"); return -ETIMEDOUT; } } else { fsleep(delay); } trace_regulator_enable_complete(rdev_get_name(rdev)); return 0; } /** * _regulator_handle_consumer_enable - handle that a consumer enabled * @regulator: regulator source * * Some things on a regulator consumer (like the contribution towards total * load on the regulator) only have an effect when the consumer wants the * regulator enabled. Explained in example with two consumers of the same * regulator: * consumer A: set_load(100); => total load = 0 * consumer A: regulator_enable(); => total load = 100 * consumer B: set_load(1000); => total load = 100 * consumer B: regulator_enable(); => total load = 1100 * consumer A: regulator_disable(); => total_load = 1000 * * This function (together with _regulator_handle_consumer_disable) is * responsible for keeping track of the refcount for a given regulator consumer * and applying / unapplying these things. * * Return: 0 on success or negative error number on failure. */ static int _regulator_handle_consumer_enable(struct regulator *regulator) { int ret; struct regulator_dev *rdev = regulator->rdev; lockdep_assert_held_once(&rdev->mutex.base); regulator->enable_count++; if (regulator->uA_load && regulator->enable_count == 1) { ret = drms_uA_update(rdev); if (ret) regulator->enable_count--; return ret; } return 0; } /** * _regulator_handle_consumer_disable - handle that a consumer disabled * @regulator: regulator source * * The opposite of _regulator_handle_consumer_enable(). * * Return: 0 on success or a negative error number on failure. */ static int _regulator_handle_consumer_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; lockdep_assert_held_once(&rdev->mutex.base); if (!regulator->enable_count) { rdev_err(rdev, "Underflow of regulator enable count\n"); return -EINVAL; } regulator->enable_count--; if (regulator->uA_load && regulator->enable_count == 0) return drms_uA_update(rdev); return 0; } /* locks held by regulator_enable() */ static int _regulator_enable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; int ret; lockdep_assert_held_once(&rdev->mutex.base); if (rdev->use_count == 0 && rdev->supply) { ret = _regulator_enable(rdev->supply); if (ret < 0) return ret; } /* balance only if there are regulators coupled */ if (rdev->coupling_desc.n_coupled > 1) { ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); if (ret < 0) goto err_disable_supply; } ret = _regulator_handle_consumer_enable(regulator); if (ret < 0) goto err_disable_supply; if (rdev->use_count == 0) { /* * The regulator may already be enabled if it's not switchable * or was left on */ ret = _regulator_is_enabled(rdev); if (ret == -EINVAL || ret == 0) { if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) { ret = -EPERM; goto err_consumer_disable; } ret = _regulator_do_enable(rdev); if (ret < 0) goto err_consumer_disable; _notifier_call_chain(rdev, REGULATOR_EVENT_ENABLE, NULL); } else if (ret < 0) { rdev_err(rdev, "is_enabled() failed: %pe\n", ERR_PTR(ret)); goto err_consumer_disable; } /* Fallthrough on positive return values - already enabled */ } if (regulator->enable_count == 1) rdev->use_count++; return 0; err_consumer_disable: _regulator_handle_consumer_disable(regulator); err_disable_supply: if (rdev->use_count == 0 && rdev->supply) _regulator_disable(rdev->supply); return ret; } /** * regulator_enable - enable regulator output * @regulator: regulator source * * Request that the regulator be enabled with the regulator output at * the predefined voltage or current value. Calls to regulator_enable() * must be balanced with calls to regulator_disable(). * * NOTE: the output value can be set by other drivers, boot loader or may be * hardwired in the regulator. * * Return: 0 on success or a negative error number on failure. */ int regulator_enable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(rdev, &ww_ctx); ret = _regulator_enable(regulator); regulator_unlock_dependent(rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_enable); static int _regulator_do_disable(struct regulator_dev *rdev) { int ret; trace_regulator_disable(rdev_get_name(rdev)); if (rdev->ena_pin) { if (rdev->ena_gpio_state) { ret = regulator_ena_gpio_ctrl(rdev, false); if (ret < 0) return ret; rdev->ena_gpio_state = 0; } } else if (rdev->desc->ops->disable) { ret = rdev->desc->ops->disable(rdev); if (ret != 0) return ret; } if (rdev->desc->off_on_delay) rdev->last_off = ktime_get_boottime(); trace_regulator_disable_complete(rdev_get_name(rdev)); return 0; } /* locks held by regulator_disable() */ static int _regulator_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; int ret = 0; lockdep_assert_held_once(&rdev->mutex.base); if (WARN(regulator->enable_count == 0, "unbalanced disables for %s\n", rdev_get_name(rdev))) return -EIO; if (regulator->enable_count == 1) { /* disabling last enable_count from this regulator */ /* are we the last user and permitted to disable ? */ if (rdev->use_count == 1 && (rdev->constraints && !rdev->constraints->always_on)) { /* we are last user */ if (regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) { ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_DISABLE, NULL); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = _regulator_do_disable(rdev); if (ret < 0) { rdev_err(rdev, "failed to disable: %pe\n", ERR_PTR(ret)); _notifier_call_chain(rdev, REGULATOR_EVENT_ABORT_DISABLE, NULL); return ret; } _notifier_call_chain(rdev, REGULATOR_EVENT_DISABLE, NULL); } rdev->use_count = 0; } else if (rdev->use_count > 1) { rdev->use_count--; } } if (ret == 0) ret = _regulator_handle_consumer_disable(regulator); if (ret == 0 && rdev->coupling_desc.n_coupled > 1) ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); if (ret == 0 && rdev->use_count == 0 && rdev->supply) ret = _regulator_disable(rdev->supply); return ret; } /** * regulator_disable - disable regulator output * @regulator: regulator source * * Disable the regulator output voltage or current. Calls to * regulator_enable() must be balanced with calls to * regulator_disable(). * * NOTE: this will only disable the regulator output if no other consumer * devices have it enabled, the regulator device supports disabling and * machine constraints permit this operation. * * Return: 0 on success or a negative error number on failure. */ int regulator_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(rdev, &ww_ctx); ret = _regulator_disable(regulator); regulator_unlock_dependent(rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_disable); /* locks held by regulator_force_disable() */ static int _regulator_force_disable(struct regulator_dev *rdev) { int ret = 0; lockdep_assert_held_once(&rdev->mutex.base); ret = _notifier_call_chain(rdev, REGULATOR_EVENT_FORCE_DISABLE | REGULATOR_EVENT_PRE_DISABLE, NULL); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = _regulator_do_disable(rdev); if (ret < 0) { rdev_err(rdev, "failed to force disable: %pe\n", ERR_PTR(ret)); _notifier_call_chain(rdev, REGULATOR_EVENT_FORCE_DISABLE | REGULATOR_EVENT_ABORT_DISABLE, NULL); return ret; } _notifier_call_chain(rdev, REGULATOR_EVENT_FORCE_DISABLE | REGULATOR_EVENT_DISABLE, NULL); return 0; } /** * regulator_force_disable - force disable regulator output * @regulator: regulator source * * Forcibly disable the regulator output voltage or current. * NOTE: this *will* disable the regulator output even if other consumer * devices have it enabled. This should be used for situations when device * damage will likely occur if the regulator is not disabled (e.g. over temp). * * Return: 0 on success or a negative error number on failure. */ int regulator_force_disable(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(rdev, &ww_ctx); ret = _regulator_force_disable(regulator->rdev); if (rdev->coupling_desc.n_coupled > 1) regulator_balance_voltage(rdev, PM_SUSPEND_ON); if (regulator->uA_load) { regulator->uA_load = 0; ret = drms_uA_update(rdev); } if (rdev->use_count != 0 && rdev->supply) _regulator_disable(rdev->supply); regulator_unlock_dependent(rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_force_disable); static void regulator_disable_work(struct work_struct *work) { struct regulator_dev *rdev = container_of(work, struct regulator_dev, disable_work.work); struct ww_acquire_ctx ww_ctx; int count, i, ret; struct regulator *regulator; int total_count = 0; regulator_lock_dependent(rdev, &ww_ctx); /* * Workqueue functions queue the new work instance while the previous * work instance is being processed. Cancel the queued work instance * as the work instance under processing does the job of the queued * work instance. */ cancel_delayed_work(&rdev->disable_work); list_for_each_entry(regulator, &rdev->consumer_list, list) { count = regulator->deferred_disables; if (!count) continue; total_count += count; regulator->deferred_disables = 0; for (i = 0; i < count; i++) { ret = _regulator_disable(regulator); if (ret != 0) rdev_err(rdev, "Deferred disable failed: %pe\n", ERR_PTR(ret)); } } WARN_ON(!total_count); if (rdev->coupling_desc.n_coupled > 1) regulator_balance_voltage(rdev, PM_SUSPEND_ON); regulator_unlock_dependent(rdev, &ww_ctx); } /** * regulator_disable_deferred - disable regulator output with delay * @regulator: regulator source * @ms: milliseconds until the regulator is disabled * * Execute regulator_disable() on the regulator after a delay. This * is intended for use with devices that require some time to quiesce. * * NOTE: this will only disable the regulator output if no other consumer * devices have it enabled, the regulator device supports disabling and * machine constraints permit this operation. * * Return: 0 on success or a negative error number on failure. */ int regulator_disable_deferred(struct regulator *regulator, int ms) { struct regulator_dev *rdev = regulator->rdev; if (!ms) return regulator_disable(regulator); regulator_lock(rdev); regulator->deferred_disables++; mod_delayed_work(system_power_efficient_wq, &rdev->disable_work, msecs_to_jiffies(ms)); regulator_unlock(rdev); return 0; } EXPORT_SYMBOL_GPL(regulator_disable_deferred); static int _regulator_is_enabled(struct regulator_dev *rdev) { /* A GPIO control always takes precedence */ if (rdev->ena_pin) return rdev->ena_gpio_state; /* If we don't know then assume that the regulator is always on */ if (!rdev->desc->ops->is_enabled) return 1; return rdev->desc->ops->is_enabled(rdev); } static int _regulator_list_voltage(struct regulator_dev *rdev, unsigned selector, int lock) { const struct regulator_ops *ops = rdev->desc->ops; int ret; if (rdev->desc->fixed_uV && rdev->desc->n_voltages == 1 && !selector) return rdev->desc->fixed_uV; if (ops->list_voltage) { if (selector >= rdev->desc->n_voltages) return -EINVAL; if (selector < rdev->desc->linear_min_sel) return 0; if (lock) regulator_lock(rdev); ret = ops->list_voltage(rdev, selector); if (lock) regulator_unlock(rdev); } else if (rdev->is_switch && rdev->supply) { ret = _regulator_list_voltage(rdev->supply->rdev, selector, lock); } else { return -EINVAL; } if (ret > 0) { if (ret < rdev->constraints->min_uV) ret = 0; else if (ret > rdev->constraints->max_uV) ret = 0; } return ret; } /** * regulator_is_enabled - is the regulator output enabled * @regulator: regulator source * * Note that the device backing this regulator handle can have multiple * users, so it might be enabled even if regulator_enable() was never * called for this particular source. * * Return: Positive if the regulator driver backing the source/client * has requested that the device be enabled, zero if it hasn't, * else a negative error number. */ int regulator_is_enabled(struct regulator *regulator) { int ret; if (regulator->always_on) return 1; regulator_lock(regulator->rdev); ret = _regulator_is_enabled(regulator->rdev); regulator_unlock(regulator->rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_is_enabled); /** * regulator_count_voltages - count regulator_list_voltage() selectors * @regulator: regulator source * * Return: Number of selectors for @regulator, or negative error number. * * Selectors are numbered starting at zero, and typically correspond to * bitfields in hardware registers. */ int regulator_count_voltages(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; if (rdev->desc->n_voltages) return rdev->desc->n_voltages; if (!rdev->is_switch || !rdev->supply) return -EINVAL; return regulator_count_voltages(rdev->supply); } EXPORT_SYMBOL_GPL(regulator_count_voltages); /** * regulator_list_voltage - enumerate supported voltages * @regulator: regulator source * @selector: identify voltage to list * Context: can sleep * * Return: Voltage for @selector that can be passed to regulator_set_voltage(), * 0 if @selector can't be used on this system, or a negative error * number on failure. */ int regulator_list_voltage(struct regulator *regulator, unsigned selector) { return _regulator_list_voltage(regulator->rdev, selector, 1); } EXPORT_SYMBOL_GPL(regulator_list_voltage); /** * regulator_get_regmap - get the regulator's register map * @regulator: regulator source * * Return: Pointer to the &struct regmap for @regulator, or ERR_PTR() * encoded -%EOPNOTSUPP if @regulator doesn't use regmap. */ struct regmap *regulator_get_regmap(struct regulator *regulator) { struct regmap *map = regulator->rdev->regmap; return map ? map : ERR_PTR(-EOPNOTSUPP); } EXPORT_SYMBOL_GPL(regulator_get_regmap); /** * regulator_get_hardware_vsel_register - get the HW voltage selector register * @regulator: regulator source * @vsel_reg: voltage selector register, output parameter * @vsel_mask: mask for voltage selector bitfield, output parameter * * Returns the hardware register offset and bitmask used for setting the * regulator voltage. This might be useful when configuring voltage-scaling * hardware or firmware that can make I2C requests behind the kernel's back, * for example. * * Return: 0 on success, or -%EOPNOTSUPP if the regulator does not support * voltage selectors. * * On success, the output parameters @vsel_reg and @vsel_mask are filled in * and 0 is returned, otherwise a negative error number is returned. */ int regulator_get_hardware_vsel_register(struct regulator *regulator, unsigned *vsel_reg, unsigned *vsel_mask) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; if (ops->set_voltage_sel != regulator_set_voltage_sel_regmap) return -EOPNOTSUPP; *vsel_reg = rdev->desc->vsel_reg; *vsel_mask = rdev->desc->vsel_mask; return 0; } EXPORT_SYMBOL_GPL(regulator_get_hardware_vsel_register); /** * regulator_list_hardware_vsel - get the HW-specific register value for a selector * @regulator: regulator source * @selector: identify voltage to list * * Converts the selector to a hardware-specific voltage selector that can be * directly written to the regulator registers. The address of the voltage * register can be determined by calling @regulator_get_hardware_vsel_register. * * Return: 0 on success, -%EINVAL if the selector is outside the supported * range, or -%EOPNOTSUPP if the regulator does not support voltage * selectors. */ int regulator_list_hardware_vsel(struct regulator *regulator, unsigned selector) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; if (selector >= rdev->desc->n_voltages) return -EINVAL; if (selector < rdev->desc->linear_min_sel) return 0; if (ops->set_voltage_sel != regulator_set_voltage_sel_regmap) return -EOPNOTSUPP; return selector; } EXPORT_SYMBOL_GPL(regulator_list_hardware_vsel); /** * regulator_hardware_enable - access the HW for enable/disable regulator * @regulator: regulator source * @enable: true for enable, false for disable * * Request that the regulator be enabled/disabled with the regulator output at * the predefined voltage or current value. * * Return: 0 on success or a negative error number on failure. */ int regulator_hardware_enable(struct regulator *regulator, bool enable) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; int ret = -EOPNOTSUPP; if (!rdev->exclusive || !ops || !ops->enable || !ops->disable) return ret; if (enable) ret = ops->enable(rdev); else ret = ops->disable(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_hardware_enable); /** * regulator_get_linear_step - return the voltage step size between VSEL values * @regulator: regulator source * * Return: The voltage step size between VSEL values for linear regulators, * or 0 if the regulator isn't a linear regulator. */ unsigned int regulator_get_linear_step(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; return rdev->desc->uV_step; } EXPORT_SYMBOL_GPL(regulator_get_linear_step); /** * regulator_is_supported_voltage - check if a voltage range can be supported * * @regulator: Regulator to check. * @min_uV: Minimum required voltage in uV. * @max_uV: Maximum required voltage in uV. * * Return: 1 if the voltage range is supported, 0 if not, or a negative error * number if @regulator's voltage can't be changed and voltage readback * failed. */ int regulator_is_supported_voltage(struct regulator *regulator, int min_uV, int max_uV) { struct regulator_dev *rdev = regulator->rdev; int i, voltages, ret; /* If we can't change voltage check the current voltage */ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) { ret = regulator_get_voltage(regulator); if (ret >= 0) return min_uV <= ret && ret <= max_uV; else return ret; } /* Any voltage within constrains range is fine? */ if (rdev->desc->continuous_voltage_range) return min_uV >= rdev->constraints->min_uV && max_uV <= rdev->constraints->max_uV; ret = regulator_count_voltages(regulator); if (ret < 0) return 0; voltages = ret; for (i = 0; i < voltages; i++) { ret = regulator_list_voltage(regulator, i); if (ret >= min_uV && ret <= max_uV) return 1; } return 0; } EXPORT_SYMBOL_GPL(regulator_is_supported_voltage); static int regulator_map_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) { const struct regulator_desc *desc = rdev->desc; if (desc->ops->map_voltage) return desc->ops->map_voltage(rdev, min_uV, max_uV); if (desc->ops->list_voltage == regulator_list_voltage_linear) return regulator_map_voltage_linear(rdev, min_uV, max_uV); if (desc->ops->list_voltage == regulator_list_voltage_linear_range) return regulator_map_voltage_linear_range(rdev, min_uV, max_uV); if (desc->ops->list_voltage == regulator_list_voltage_pickable_linear_range) return regulator_map_voltage_pickable_linear_range(rdev, min_uV, max_uV); return regulator_map_voltage_iterate(rdev, min_uV, max_uV); } static int _regulator_call_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV, unsigned *selector) { struct pre_voltage_change_data data; int ret; data.old_uV = regulator_get_voltage_rdev(rdev); data.min_uV = min_uV; data.max_uV = max_uV; ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_VOLTAGE_CHANGE, &data); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = rdev->desc->ops->set_voltage(rdev, min_uV, max_uV, selector); if (ret >= 0) return ret; _notifier_call_chain(rdev, REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE, (void *)data.old_uV); return ret; } static int _regulator_call_set_voltage_sel(struct regulator_dev *rdev, int uV, unsigned selector) { struct pre_voltage_change_data data; int ret; data.old_uV = regulator_get_voltage_rdev(rdev); data.min_uV = uV; data.max_uV = uV; ret = _notifier_call_chain(rdev, REGULATOR_EVENT_PRE_VOLTAGE_CHANGE, &data); if (ret & NOTIFY_STOP_MASK) return -EINVAL; ret = rdev->desc->ops->set_voltage_sel(rdev, selector); if (ret >= 0) return ret; _notifier_call_chain(rdev, REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE, (void *)data.old_uV); return ret; } static int _regulator_set_voltage_sel_step(struct regulator_dev *rdev, int uV, int new_selector) { const struct regulator_ops *ops = rdev->desc->ops; int diff, old_sel, curr_sel, ret; /* Stepping is only needed if the regulator is enabled. */ if (!_regulator_is_enabled(rdev)) goto final_set; if (!ops->get_voltage_sel) return -EINVAL; old_sel = ops->get_voltage_sel(rdev); if (old_sel < 0) return old_sel; diff = new_selector - old_sel; if (diff == 0) return 0; /* No change needed. */ if (diff > 0) { /* Stepping up. */ for (curr_sel = old_sel + rdev->desc->vsel_step; curr_sel < new_selector; curr_sel += rdev->desc->vsel_step) { /* * Call the callback directly instead of using * _regulator_call_set_voltage_sel() as we don't * want to notify anyone yet. Same in the branch * below. */ ret = ops->set_voltage_sel(rdev, curr_sel); if (ret) goto try_revert; } } else { /* Stepping down. */ for (curr_sel = old_sel - rdev->desc->vsel_step; curr_sel > new_selector; curr_sel -= rdev->desc->vsel_step) { ret = ops->set_voltage_sel(rdev, curr_sel); if (ret) goto try_revert; } } final_set: /* The final selector will trigger the notifiers. */ return _regulator_call_set_voltage_sel(rdev, uV, new_selector); try_revert: /* * At least try to return to the previous voltage if setting a new * one failed. */ (void)ops->set_voltage_sel(rdev, old_sel); return ret; } static int _regulator_set_voltage_time(struct regulator_dev *rdev, int old_uV, int new_uV) { unsigned int ramp_delay = 0; if (rdev->constraints->ramp_delay) ramp_delay = rdev->constraints->ramp_delay; else if (rdev->desc->ramp_delay) ramp_delay = rdev->desc->ramp_delay; else if (rdev->constraints->settling_time) return rdev->constraints->settling_time; else if (rdev->constraints->settling_time_up && (new_uV > old_uV)) return rdev->constraints->settling_time_up; else if (rdev->constraints->settling_time_down && (new_uV < old_uV)) return rdev->constraints->settling_time_down; if (ramp_delay == 0) return 0; return DIV_ROUND_UP(abs(new_uV - old_uV), ramp_delay); } static int _regulator_do_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) { int ret; int delay = 0; int best_val = 0; unsigned int selector; int old_selector = -1; const struct regulator_ops *ops = rdev->desc->ops; int old_uV = regulator_get_voltage_rdev(rdev); trace_regulator_set_voltage(rdev_get_name(rdev), min_uV, max_uV); min_uV += rdev->constraints->uV_offset; max_uV += rdev->constraints->uV_offset; /* * If we can't obtain the old selector there is not enough * info to call set_voltage_time_sel(). */ if (_regulator_is_enabled(rdev) && ops->set_voltage_time_sel && ops->get_voltage_sel) { old_selector = ops->get_voltage_sel(rdev); if (old_selector < 0) return old_selector; } if (ops->set_voltage) { ret = _regulator_call_set_voltage(rdev, min_uV, max_uV, &selector); if (ret >= 0) { if (ops->list_voltage) best_val = ops->list_voltage(rdev, selector); else best_val = regulator_get_voltage_rdev(rdev); } } else if (ops->set_voltage_sel) { ret = regulator_map_voltage(rdev, min_uV, max_uV); if (ret >= 0) { best_val = ops->list_voltage(rdev, ret); if (min_uV <= best_val && max_uV >= best_val) { selector = ret; if (old_selector == selector) ret = 0; else if (rdev->desc->vsel_step) ret = _regulator_set_voltage_sel_step( rdev, best_val, selector); else ret = _regulator_call_set_voltage_sel( rdev, best_val, selector); } else { ret = -EINVAL; } } } else { ret = -EINVAL; } if (ret) goto out; if (ops->set_voltage_time_sel) { /* * Call set_voltage_time_sel if successfully obtained * old_selector */ if (old_selector >= 0 && old_selector != selector) delay = ops->set_voltage_time_sel(rdev, old_selector, selector); } else { if (old_uV != best_val) { if (ops->set_voltage_time) delay = ops->set_voltage_time(rdev, old_uV, best_val); else delay = _regulator_set_voltage_time(rdev, old_uV, best_val); } } if (delay < 0) { rdev_warn(rdev, "failed to get delay: %pe\n", ERR_PTR(delay)); delay = 0; } /* Insert any necessary delays */ fsleep(delay); if (best_val >= 0) { unsigned long data = best_val; _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, (void *)data); } out: trace_regulator_set_voltage_complete(rdev_get_name(rdev), best_val); return ret; } static int _regulator_do_set_suspend_voltage(struct regulator_dev *rdev, int min_uV, int max_uV, suspend_state_t state) { struct regulator_state *rstate; int uV, sel; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return -EINVAL; if (min_uV < rstate->min_uV) min_uV = rstate->min_uV; if (max_uV > rstate->max_uV) max_uV = rstate->max_uV; sel = regulator_map_voltage(rdev, min_uV, max_uV); if (sel < 0) return sel; uV = rdev->desc->ops->list_voltage(rdev, sel); if (uV >= min_uV && uV <= max_uV) rstate->uV = uV; return 0; } static int regulator_get_voltage_delta(struct regulator_dev *rdev, int uV) { int current_uV = regulator_get_voltage_rdev(rdev); if (current_uV < 0) return current_uV; return abs(current_uV - uV); } static int regulator_set_voltage_unlocked(struct regulator *regulator, int min_uV, int max_uV, suspend_state_t state) { struct regulator_dev *rdev = regulator->rdev; struct regulator_voltage *voltage = &regulator->voltage[state]; int ret = 0; int current_uV, delta, new_delta; int old_min_uV, old_max_uV; /* If we're setting the same range as last time the change * should be a noop (some cpufreq implementations use the same * voltage for multiple frequencies, for example). */ if (voltage->min_uV == min_uV && voltage->max_uV == max_uV) goto out; /* If we're trying to set a range that overlaps the current voltage, * return successfully even though the regulator does not support * changing the voltage. */ if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) { current_uV = regulator_get_voltage_rdev(rdev); if (min_uV <= current_uV && current_uV <= max_uV) { voltage->min_uV = min_uV; voltage->max_uV = max_uV; goto out; } } /* sanity check */ if (!rdev->desc->ops->set_voltage && !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } /* constraints check */ ret = regulator_check_voltage(rdev, &min_uV, &max_uV); if (ret < 0) goto out; /* restore original values in case of error */ old_min_uV = voltage->min_uV; old_max_uV = voltage->max_uV; voltage->min_uV = min_uV; voltage->max_uV = max_uV; /* for not coupled regulators this will just set the voltage */ ret = regulator_balance_voltage(rdev, state); if (ret < 0) { voltage->min_uV = old_min_uV; voltage->max_uV = old_max_uV; } if (rdev->constraints->max_uV_step > 0) { /* For regulators with a maximum voltage step, reaching the desired * voltage might take a few retries. */ ret = regulator_get_voltage_delta(rdev, min_uV); if (ret < 0) goto out; delta = ret; while (delta > 0) { ret = regulator_balance_voltage(rdev, state); if (ret < 0) goto out; ret = regulator_get_voltage_delta(rdev, min_uV); if (ret < 0) goto out; new_delta = ret; /* check that voltage is converging quickly enough */ if (delta - new_delta < rdev->constraints->max_uV_step) { ret = -EWOULDBLOCK; goto out; } delta = new_delta; } } out: return ret; } int regulator_set_voltage_rdev(struct regulator_dev *rdev, int min_uV, int max_uV, suspend_state_t state) { int best_supply_uV = 0; int supply_change_uV = 0; int ret; if (rdev->supply && regulator_ops_is_valid(rdev->supply->rdev, REGULATOR_CHANGE_VOLTAGE) && (rdev->desc->min_dropout_uV || !(rdev->desc->ops->get_voltage || rdev->desc->ops->get_voltage_sel))) { int current_supply_uV; int selector; selector = regulator_map_voltage(rdev, min_uV, max_uV); if (selector < 0) { ret = selector; goto out; } best_supply_uV = _regulator_list_voltage(rdev, selector, 0); if (best_supply_uV < 0) { ret = best_supply_uV; goto out; } best_supply_uV += rdev->desc->min_dropout_uV; current_supply_uV = regulator_get_voltage_rdev(rdev->supply->rdev); if (current_supply_uV < 0) { ret = current_supply_uV; goto out; } supply_change_uV = best_supply_uV - current_supply_uV; } if (supply_change_uV > 0) { ret = regulator_set_voltage_unlocked(rdev->supply, best_supply_uV, INT_MAX, state); if (ret) { dev_err(&rdev->dev, "Failed to increase supply voltage: %pe\n", ERR_PTR(ret)); goto out; } } if (state == PM_SUSPEND_ON) ret = _regulator_do_set_voltage(rdev, min_uV, max_uV); else ret = _regulator_do_set_suspend_voltage(rdev, min_uV, max_uV, state); if (ret < 0) goto out; if (supply_change_uV < 0) { ret = regulator_set_voltage_unlocked(rdev->supply, best_supply_uV, INT_MAX, state); if (ret) dev_warn(&rdev->dev, "Failed to decrease supply voltage: %pe\n", ERR_PTR(ret)); /* No need to fail here */ ret = 0; } out: return ret; } EXPORT_SYMBOL_GPL(regulator_set_voltage_rdev); static int regulator_limit_voltage_step(struct regulator_dev *rdev, int *current_uV, int *min_uV) { struct regulation_constraints *constraints = rdev->constraints; /* Limit voltage change only if necessary */ if (!constraints->max_uV_step || !_regulator_is_enabled(rdev)) return 1; if (*current_uV < 0) { *current_uV = regulator_get_voltage_rdev(rdev); if (*current_uV < 0) return *current_uV; } if (abs(*current_uV - *min_uV) <= constraints->max_uV_step) return 1; /* Clamp target voltage within the given step */ if (*current_uV < *min_uV) *min_uV = min(*current_uV + constraints->max_uV_step, *min_uV); else *min_uV = max(*current_uV - constraints->max_uV_step, *min_uV); return 0; } static int regulator_get_optimal_voltage(struct regulator_dev *rdev, int *current_uV, int *min_uV, int *max_uV, suspend_state_t state, int n_coupled) { struct coupling_desc *c_desc = &rdev->coupling_desc; struct regulator_dev **c_rdevs = c_desc->coupled_rdevs; struct regulation_constraints *constraints = rdev->constraints; int desired_min_uV = 0, desired_max_uV = INT_MAX; int max_current_uV = 0, min_current_uV = INT_MAX; int highest_min_uV = 0, target_uV, possible_uV; int i, ret, max_spread; bool done; *current_uV = -1; /* * If there are no coupled regulators, simply set the voltage * demanded by consumers. */ if (n_coupled == 1) { /* * If consumers don't provide any demands, set voltage * to min_uV */ desired_min_uV = constraints->min_uV; desired_max_uV = constraints->max_uV; ret = regulator_check_consumers(rdev, &desired_min_uV, &desired_max_uV, state); if (ret < 0) return ret; done = true; goto finish; } /* Find highest min desired voltage */ for (i = 0; i < n_coupled; i++) { int tmp_min = 0; int tmp_max = INT_MAX; lockdep_assert_held_once(&c_rdevs[i]->mutex.base); ret = regulator_check_consumers(c_rdevs[i], &tmp_min, &tmp_max, state); if (ret < 0) return ret; ret = regulator_check_voltage(c_rdevs[i], &tmp_min, &tmp_max); if (ret < 0) return ret; highest_min_uV = max(highest_min_uV, tmp_min); if (i == 0) { desired_min_uV = tmp_min; desired_max_uV = tmp_max; } } max_spread = constraints->max_spread[0]; /* * Let target_uV be equal to the desired one if possible. * If not, set it to minimum voltage, allowed by other coupled * regulators. */ target_uV = max(desired_min_uV, highest_min_uV - max_spread); /* * Find min and max voltages, which currently aren't violating * max_spread. */ for (i = 1; i < n_coupled; i++) { int tmp_act; if (!_regulator_is_enabled(c_rdevs[i])) continue; tmp_act = regulator_get_voltage_rdev(c_rdevs[i]); if (tmp_act < 0) return tmp_act; min_current_uV = min(tmp_act, min_current_uV); max_current_uV = max(tmp_act, max_current_uV); } /* There aren't any other regulators enabled */ if (max_current_uV == 0) { possible_uV = target_uV; } else { /* * Correct target voltage, so as it currently isn't * violating max_spread */ possible_uV = max(target_uV, max_current_uV - max_spread); possible_uV = min(possible_uV, min_current_uV + max_spread); } if (possible_uV > desired_max_uV) return -EINVAL; done = (possible_uV == target_uV); desired_min_uV = possible_uV; finish: /* Apply max_uV_step constraint if necessary */ if (state == PM_SUSPEND_ON) { ret = regulator_limit_voltage_step(rdev, current_uV, &desired_min_uV); if (ret < 0) return ret; if (ret == 0) done = false; } /* Set current_uV if wasn't done earlier in the code and if necessary */ if (n_coupled > 1 && *current_uV == -1) { if (_regulator_is_enabled(rdev)) { ret = regulator_get_voltage_rdev(rdev); if (ret < 0) return ret; *current_uV = ret; } else { *current_uV = desired_min_uV; } } *min_uV = desired_min_uV; *max_uV = desired_max_uV; return done; } int regulator_do_balance_voltage(struct regulator_dev *rdev, suspend_state_t state, bool skip_coupled) { struct regulator_dev **c_rdevs; struct regulator_dev *best_rdev; struct coupling_desc *c_desc = &rdev->coupling_desc; int i, ret, n_coupled, best_min_uV, best_max_uV, best_c_rdev; unsigned int delta, best_delta; unsigned long c_rdev_done = 0; bool best_c_rdev_done; c_rdevs = c_desc->coupled_rdevs; n_coupled = skip_coupled ? 1 : c_desc->n_coupled; /* * Find the best possible voltage change on each loop. Leave the loop * if there isn't any possible change. */ do { best_c_rdev_done = false; best_delta = 0; best_min_uV = 0; best_max_uV = 0; best_c_rdev = 0; best_rdev = NULL; /* * Find highest difference between optimal voltage * and current voltage. */ for (i = 0; i < n_coupled; i++) { /* * optimal_uV is the best voltage that can be set for * i-th regulator at the moment without violating * max_spread constraint in order to balance * the coupled voltages. */ int optimal_uV = 0, optimal_max_uV = 0, current_uV = 0; if (test_bit(i, &c_rdev_done)) continue; ret = regulator_get_optimal_voltage(c_rdevs[i], &current_uV, &optimal_uV, &optimal_max_uV, state, n_coupled); if (ret < 0) goto out; delta = abs(optimal_uV - current_uV); if (delta && best_delta <= delta) { best_c_rdev_done = ret; best_delta = delta; best_rdev = c_rdevs[i]; best_min_uV = optimal_uV; best_max_uV = optimal_max_uV; best_c_rdev = i; } } /* Nothing to change, return successfully */ if (!best_rdev) { ret = 0; goto out; } ret = regulator_set_voltage_rdev(best_rdev, best_min_uV, best_max_uV, state); if (ret < 0) goto out; if (best_c_rdev_done) set_bit(best_c_rdev, &c_rdev_done); } while (n_coupled > 1); out: return ret; } static int regulator_balance_voltage(struct regulator_dev *rdev, suspend_state_t state) { struct coupling_desc *c_desc = &rdev->coupling_desc; struct regulator_coupler *coupler = c_desc->coupler; bool skip_coupled = false; /* * If system is in a state other than PM_SUSPEND_ON, don't check * other coupled regulators. */ if (state != PM_SUSPEND_ON) skip_coupled = true; if (c_desc->n_resolved < c_desc->n_coupled) { rdev_err(rdev, "Not all coupled regulators registered\n"); return -EPERM; } /* Invoke custom balancer for customized couplers */ if (coupler && coupler->balance_voltage) return coupler->balance_voltage(coupler, rdev, state); return regulator_do_balance_voltage(rdev, state, skip_coupled); } /** * regulator_set_voltage - set regulator output voltage * @regulator: regulator source * @min_uV: Minimum required voltage in uV * @max_uV: Maximum acceptable voltage in uV * * Sets a voltage regulator to the desired output voltage. This can be set * during any regulator state. IOW, regulator can be disabled or enabled. * * If the regulator is enabled then the voltage will change to the new value * immediately otherwise if the regulator is disabled the regulator will * output at the new voltage when enabled. * * NOTE: If the regulator is shared between several devices then the lowest * request voltage that meets the system constraints will be used. * Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV) { struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(regulator->rdev, &ww_ctx); ret = regulator_set_voltage_unlocked(regulator, min_uV, max_uV, PM_SUSPEND_ON); regulator_unlock_dependent(regulator->rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_set_voltage); static inline int regulator_suspend_toggle(struct regulator_dev *rdev, suspend_state_t state, bool en) { struct regulator_state *rstate; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return -EINVAL; if (!rstate->changeable) return -EPERM; rstate->enabled = (en) ? ENABLE_IN_SUSPEND : DISABLE_IN_SUSPEND; return 0; } int regulator_suspend_enable(struct regulator_dev *rdev, suspend_state_t state) { return regulator_suspend_toggle(rdev, state, true); } EXPORT_SYMBOL_GPL(regulator_suspend_enable); int regulator_suspend_disable(struct regulator_dev *rdev, suspend_state_t state) { struct regulator *regulator; struct regulator_voltage *voltage; /* * if any consumer wants this regulator device keeping on in * suspend states, don't set it as disabled. */ list_for_each_entry(regulator, &rdev->consumer_list, list) { voltage = &regulator->voltage[state]; if (voltage->min_uV || voltage->max_uV) return 0; } return regulator_suspend_toggle(rdev, state, false); } EXPORT_SYMBOL_GPL(regulator_suspend_disable); static int _regulator_set_suspend_voltage(struct regulator *regulator, int min_uV, int max_uV, suspend_state_t state) { struct regulator_dev *rdev = regulator->rdev; struct regulator_state *rstate; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return -EINVAL; if (rstate->min_uV == rstate->max_uV) { rdev_err(rdev, "The suspend voltage can't be changed!\n"); return -EPERM; } return regulator_set_voltage_unlocked(regulator, min_uV, max_uV, state); } int regulator_set_suspend_voltage(struct regulator *regulator, int min_uV, int max_uV, suspend_state_t state) { struct ww_acquire_ctx ww_ctx; int ret; /* PM_SUSPEND_ON is handled by regulator_set_voltage() */ if (regulator_check_states(state) || state == PM_SUSPEND_ON) return -EINVAL; regulator_lock_dependent(regulator->rdev, &ww_ctx); ret = _regulator_set_suspend_voltage(regulator, min_uV, max_uV, state); regulator_unlock_dependent(regulator->rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_set_suspend_voltage); /** * regulator_set_voltage_time - get raise/fall time * @regulator: regulator source * @old_uV: starting voltage in microvolts * @new_uV: target voltage in microvolts * * Provided with the starting and ending voltage, this function attempts to * calculate the time in microseconds required to rise or fall to this new * voltage. * * Return: ramp time in microseconds, or a negative error number if calculation failed. */ int regulator_set_voltage_time(struct regulator *regulator, int old_uV, int new_uV) { struct regulator_dev *rdev = regulator->rdev; const struct regulator_ops *ops = rdev->desc->ops; int old_sel = -1; int new_sel = -1; int voltage; int i; if (ops->set_voltage_time) return ops->set_voltage_time(rdev, old_uV, new_uV); else if (!ops->set_voltage_time_sel) return _regulator_set_voltage_time(rdev, old_uV, new_uV); /* Currently requires operations to do this */ if (!ops->list_voltage || !rdev->desc->n_voltages) return -EINVAL; for (i = 0; i < rdev->desc->n_voltages; i++) { /* We only look for exact voltage matches here */ if (i < rdev->desc->linear_min_sel) continue; if (old_sel >= 0 && new_sel >= 0) break; voltage = regulator_list_voltage(regulator, i); if (voltage < 0) return -EINVAL; if (voltage == 0) continue; if (voltage == old_uV) old_sel = i; if (voltage == new_uV) new_sel = i; } if (old_sel < 0 || new_sel < 0) return -EINVAL; return ops->set_voltage_time_sel(rdev, old_sel, new_sel); } EXPORT_SYMBOL_GPL(regulator_set_voltage_time); /** * regulator_set_voltage_time_sel - get raise/fall time * @rdev: regulator source device * @old_selector: selector for starting voltage * @new_selector: selector for target voltage * * Provided with the starting and target voltage selectors, this function * returns time in microseconds required to rise or fall to this new voltage * * Drivers providing ramp_delay in regulation_constraints can use this as their * set_voltage_time_sel() operation. * * Return: ramp time in microseconds, or a negative error number if calculation failed. */ int regulator_set_voltage_time_sel(struct regulator_dev *rdev, unsigned int old_selector, unsigned int new_selector) { int old_volt, new_volt; /* sanity check */ if (!rdev->desc->ops->list_voltage) return -EINVAL; old_volt = rdev->desc->ops->list_voltage(rdev, old_selector); new_volt = rdev->desc->ops->list_voltage(rdev, new_selector); if (rdev->desc->ops->set_voltage_time) return rdev->desc->ops->set_voltage_time(rdev, old_volt, new_volt); else return _regulator_set_voltage_time(rdev, old_volt, new_volt); } EXPORT_SYMBOL_GPL(regulator_set_voltage_time_sel); int regulator_sync_voltage_rdev(struct regulator_dev *rdev) { int ret; regulator_lock(rdev); if (!rdev->desc->ops->set_voltage && !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } /* balance only, if regulator is coupled */ if (rdev->coupling_desc.n_coupled > 1) ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); else ret = -EOPNOTSUPP; out: regulator_unlock(rdev); return ret; } /** * regulator_sync_voltage - re-apply last regulator output voltage * @regulator: regulator source * * Re-apply the last configured voltage. This is intended to be used * where some external control source the consumer is cooperating with * has caused the configured voltage to change. * * Return: 0 on success or a negative error number on failure. */ int regulator_sync_voltage(struct regulator *regulator) { struct regulator_dev *rdev = regulator->rdev; struct regulator_voltage *voltage = &regulator->voltage[PM_SUSPEND_ON]; int ret, min_uV, max_uV; if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_VOLTAGE)) return 0; regulator_lock(rdev); if (!rdev->desc->ops->set_voltage && !rdev->desc->ops->set_voltage_sel) { ret = -EINVAL; goto out; } /* This is only going to work if we've had a voltage configured. */ if (!voltage->min_uV && !voltage->max_uV) { ret = -EINVAL; goto out; } min_uV = voltage->min_uV; max_uV = voltage->max_uV; /* This should be a paranoia check... */ ret = regulator_check_voltage(rdev, &min_uV, &max_uV); if (ret < 0) goto out; ret = regulator_check_consumers(rdev, &min_uV, &max_uV, 0); if (ret < 0) goto out; /* balance only, if regulator is coupled */ if (rdev->coupling_desc.n_coupled > 1) ret = regulator_balance_voltage(rdev, PM_SUSPEND_ON); else ret = _regulator_do_set_voltage(rdev, min_uV, max_uV); out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_sync_voltage); int regulator_get_voltage_rdev(struct regulator_dev *rdev) { int sel, ret; bool bypassed; if (rdev->desc->ops->get_bypass) { ret = rdev->desc->ops->get_bypass(rdev, &bypassed); if (ret < 0) return ret; if (bypassed) { /* if bypassed the regulator must have a supply */ if (!rdev->supply) { rdev_err(rdev, "bypassed regulator has no supply!\n"); return -EPROBE_DEFER; } return regulator_get_voltage_rdev(rdev->supply->rdev); } } if (rdev->desc->ops->get_voltage_sel) { sel = rdev->desc->ops->get_voltage_sel(rdev); if (sel < 0) return sel; ret = rdev->desc->ops->list_voltage(rdev, sel); } else if (rdev->desc->ops->get_voltage) { ret = rdev->desc->ops->get_voltage(rdev); } else if (rdev->desc->ops->list_voltage) { ret = rdev->desc->ops->list_voltage(rdev, 0); } else if (rdev->desc->fixed_uV && (rdev->desc->n_voltages == 1)) { ret = rdev->desc->fixed_uV; } else if (rdev->supply) { ret = regulator_get_voltage_rdev(rdev->supply->rdev); } else if (rdev->supply_name) { return -EPROBE_DEFER; } else { return -EINVAL; } if (ret < 0) return ret; return ret - rdev->constraints->uV_offset; } EXPORT_SYMBOL_GPL(regulator_get_voltage_rdev); /** * regulator_get_voltage - get regulator output voltage * @regulator: regulator source * * Return: Current regulator voltage in uV, or a negative error number on failure. * * NOTE: If the regulator is disabled it will return the voltage value. This * function should not be used to determine regulator state. */ int regulator_get_voltage(struct regulator *regulator) { struct ww_acquire_ctx ww_ctx; int ret; regulator_lock_dependent(regulator->rdev, &ww_ctx); ret = regulator_get_voltage_rdev(regulator->rdev); regulator_unlock_dependent(regulator->rdev, &ww_ctx); return ret; } EXPORT_SYMBOL_GPL(regulator_get_voltage); /** * regulator_set_current_limit - set regulator output current limit * @regulator: regulator source * @min_uA: Minimum supported current in uA * @max_uA: Maximum supported current in uA * * Sets current sink to the desired output current. This can be set during * any regulator state. IOW, regulator can be disabled or enabled. * * If the regulator is enabled then the current will change to the new value * immediately otherwise if the regulator is disabled the regulator will * output at the new current when enabled. * * NOTE: Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_current_limit(struct regulator *regulator, int min_uA, int max_uA) { struct regulator_dev *rdev = regulator->rdev; int ret; regulator_lock(rdev); /* sanity check */ if (!rdev->desc->ops->set_current_limit) { ret = -EINVAL; goto out; } /* constraints check */ ret = regulator_check_current_limit(rdev, &min_uA, &max_uA); if (ret < 0) goto out; ret = rdev->desc->ops->set_current_limit(rdev, min_uA, max_uA); out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_set_current_limit); static int _regulator_get_current_limit_unlocked(struct regulator_dev *rdev) { /* sanity check */ if (!rdev->desc->ops->get_current_limit) return -EINVAL; return rdev->desc->ops->get_current_limit(rdev); } static int _regulator_get_current_limit(struct regulator_dev *rdev) { int ret; regulator_lock(rdev); ret = _regulator_get_current_limit_unlocked(rdev); regulator_unlock(rdev); return ret; } /** * regulator_get_current_limit - get regulator output current * @regulator: regulator source * * Return: Current supplied by the specified current sink in uA, * or a negative error number on failure. * * NOTE: If the regulator is disabled it will return the current value. This * function should not be used to determine regulator state. */ int regulator_get_current_limit(struct regulator *regulator) { return _regulator_get_current_limit(regulator->rdev); } EXPORT_SYMBOL_GPL(regulator_get_current_limit); /** * regulator_get_unclaimed_power_budget - get regulator unclaimed power budget * @regulator: regulator source * * Return: Unclaimed power budget of the regulator in mW. */ int regulator_get_unclaimed_power_budget(struct regulator *regulator) { return regulator->rdev->constraints->pw_budget_mW - regulator->rdev->pw_requested_mW; } EXPORT_SYMBOL_GPL(regulator_get_unclaimed_power_budget); /** * regulator_request_power_budget - request power budget on a regulator * @regulator: regulator source * @pw_req: Power requested * * Return: 0 on success or a negative error number on failure. */ int regulator_request_power_budget(struct regulator *regulator, unsigned int pw_req) { struct regulator_dev *rdev = regulator->rdev; int ret = 0, pw_tot_req; regulator_lock(rdev); if (rdev->supply) { ret = regulator_request_power_budget(rdev->supply, pw_req); if (ret < 0) goto out; } pw_tot_req = rdev->pw_requested_mW + pw_req; if (pw_tot_req > rdev->constraints->pw_budget_mW) { rdev_warn(rdev, "power requested %d mW out of budget %d mW", pw_req, rdev->constraints->pw_budget_mW - rdev->pw_requested_mW); regulator_notifier_call_chain(rdev, REGULATOR_EVENT_OVER_CURRENT_WARN, NULL); ret = -ERANGE; goto out; } rdev->pw_requested_mW = pw_tot_req; out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_request_power_budget); /** * regulator_free_power_budget - free power budget on a regulator * @regulator: regulator source * @pw: Power to be released. * * Return: Power budget of the regulator in mW. */ void regulator_free_power_budget(struct regulator *regulator, unsigned int pw) { struct regulator_dev *rdev = regulator->rdev; int pw_tot_req; regulator_lock(rdev); if (rdev->supply) regulator_free_power_budget(rdev->supply, pw); pw_tot_req = rdev->pw_requested_mW - pw; if (pw_tot_req >= 0) rdev->pw_requested_mW = pw_tot_req; else rdev_warn(rdev, "too much power freed %d mW (already requested %d mW)", pw, rdev->pw_requested_mW); regulator_unlock(rdev); } EXPORT_SYMBOL_GPL(regulator_free_power_budget); /** * regulator_set_mode - set regulator operating mode * @regulator: regulator source * @mode: operating mode - one of the REGULATOR_MODE constants * * Set regulator operating mode to increase regulator efficiency or improve * regulation performance. * * NOTE: Regulator system constraints must be set for this regulator before * calling this function otherwise this call will fail. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_mode(struct regulator *regulator, unsigned int mode) { struct regulator_dev *rdev = regulator->rdev; int ret; int regulator_curr_mode; regulator_lock(rdev); /* sanity check */ if (!rdev->desc->ops->set_mode) { ret = -EINVAL; goto out; } /* return if the same mode is requested */ if (rdev->desc->ops->get_mode) { regulator_curr_mode = rdev->desc->ops->get_mode(rdev); if (regulator_curr_mode == mode) { ret = 0; goto out; } } /* constraints check */ ret = regulator_mode_constrain(rdev, &mode); if (ret < 0) goto out; ret = rdev->desc->ops->set_mode(rdev, mode); out: regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_set_mode); static unsigned int _regulator_get_mode_unlocked(struct regulator_dev *rdev) { /* sanity check */ if (!rdev->desc->ops->get_mode) return -EINVAL; return rdev->desc->ops->get_mode(rdev); } static unsigned int _regulator_get_mode(struct regulator_dev *rdev) { int ret; regulator_lock(rdev); ret = _regulator_get_mode_unlocked(rdev); regulator_unlock(rdev); return ret; } /** * regulator_get_mode - get regulator operating mode * @regulator: regulator source * * Get the current regulator operating mode. * * Return: Current operating mode as %REGULATOR_MODE_* values, * or a negative error number on failure. */ unsigned int regulator_get_mode(struct regulator *regulator) { return _regulator_get_mode(regulator->rdev); } EXPORT_SYMBOL_GPL(regulator_get_mode); static int rdev_get_cached_err_flags(struct regulator_dev *rdev) { int ret = 0; if (rdev->use_cached_err) { spin_lock(&rdev->err_lock); ret = rdev->cached_err; spin_unlock(&rdev->err_lock); } return ret; } static int _regulator_get_error_flags(struct regulator_dev *rdev, unsigned int *flags) { int cached_flags, ret = 0; regulator_lock(rdev); cached_flags = rdev_get_cached_err_flags(rdev); if (rdev->desc->ops->get_error_flags) ret = rdev->desc->ops->get_error_flags(rdev, flags); else if (!rdev->use_cached_err) ret = -EINVAL; *flags |= cached_flags; regulator_unlock(rdev); return ret; } /** * regulator_get_error_flags - get regulator error information * @regulator: regulator source * @flags: pointer to store error flags * * Get the current regulator error information. * * Return: 0 on success or a negative error number on failure. */ int regulator_get_error_flags(struct regulator *regulator, unsigned int *flags) { return _regulator_get_error_flags(regulator->rdev, flags); } EXPORT_SYMBOL_GPL(regulator_get_error_flags); /** * regulator_set_load - set regulator load * @regulator: regulator source * @uA_load: load current * * Notifies the regulator core of a new device load. This is then used by * DRMS (if enabled by constraints) to set the most efficient regulator * operating mode for the new regulator loading. * * Consumer devices notify their supply regulator of the maximum power * they will require (can be taken from device datasheet in the power * consumption tables) when they change operational status and hence power * state. Examples of operational state changes that can affect power * consumption are :- * * o Device is opened / closed. * o Device I/O is about to begin or has just finished. * o Device is idling in between work. * * This information is also exported via sysfs to userspace. * * DRMS will sum the total requested load on the regulator and change * to the most efficient operating mode if platform constraints allow. * * NOTE: when a regulator consumer requests to have a regulator * disabled then any load that consumer requested no longer counts * toward the total requested load. If the regulator is re-enabled * then the previously requested load will start counting again. * * If a regulator is an always-on regulator then an individual consumer's * load will still be removed if that consumer is fully disabled. * * Return: 0 on success or a negative error number on failure. */ int regulator_set_load(struct regulator *regulator, int uA_load) { struct regulator_dev *rdev = regulator->rdev; int old_uA_load; int ret = 0; regulator_lock(rdev); old_uA_load = regulator->uA_load; regulator->uA_load = uA_load; if (regulator->enable_count && old_uA_load != uA_load) { ret = drms_uA_update(rdev); if (ret < 0) regulator->uA_load = old_uA_load; } regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_set_load); /** * regulator_allow_bypass - allow the regulator to go into bypass mode * * @regulator: Regulator to configure * @enable: enable or disable bypass mode * * Allow the regulator to go into bypass mode if all other consumers * for the regulator also enable bypass mode and the machine * constraints allow this. Bypass mode means that the regulator is * simply passing the input directly to the output with no regulation. * * Return: 0 on success or if changing bypass is not possible, or * a negative error number on failure. */ int regulator_allow_bypass(struct regulator *regulator, bool enable) { struct regulator_dev *rdev = regulator->rdev; const char *name = rdev_get_name(rdev); int ret = 0; if (!rdev->desc->ops->set_bypass) return 0; if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_BYPASS)) return 0; regulator_lock(rdev); if (enable && !regulator->bypass) { rdev->bypass_count++; if (rdev->bypass_count == rdev->open_count) { trace_regulator_bypass_enable(name); ret = rdev->desc->ops->set_bypass(rdev, enable); if (ret != 0) rdev->bypass_count--; else trace_regulator_bypass_enable_complete(name); } } else if (!enable && regulator->bypass) { rdev->bypass_count--; if (rdev->bypass_count != rdev->open_count) { trace_regulator_bypass_disable(name); ret = rdev->desc->ops->set_bypass(rdev, enable); if (ret != 0) rdev->bypass_count++; else trace_regulator_bypass_disable_complete(name); } } if (ret == 0) regulator->bypass = enable; regulator_unlock(rdev); return ret; } EXPORT_SYMBOL_GPL(regulator_allow_bypass); /** * regulator_register_notifier - register regulator event notifier * @regulator: regulator source * @nb: notifier block * * Register notifier block to receive regulator events. * * Return: 0 on success or a negative error number on failure. */ int regulator_register_notifier(struct regulator *regulator, struct notifier_block *nb) { return blocking_notifier_chain_register(&regulator->rdev->notifier, nb); } EXPORT_SYMBOL_GPL(regulator_register_notifier); /** * regulator_unregister_notifier - unregister regulator event notifier * @regulator: regulator source * @nb: notifier block * * Unregister regulator event notifier block. * * Return: 0 on success or a negative error number on failure. */ int regulator_unregister_notifier(struct regulator *regulator, struct notifier_block *nb) { return blocking_notifier_chain_unregister(&regulator->rdev->notifier, nb); } EXPORT_SYMBOL_GPL(regulator_unregister_notifier); /* notify regulator consumers and downstream regulator consumers. * Note mutex must be held by caller. */ static int _notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { /* call rdev chain first */ int ret = blocking_notifier_call_chain(&rdev->notifier, event, data); if (IS_REACHABLE(CONFIG_REGULATOR_NETLINK_EVENTS)) { struct device *parent = rdev->dev.parent; const char *rname = rdev_get_name(rdev); char name[32]; /* Avoid duplicate debugfs directory names */ if (parent && rname == rdev->desc->name) { snprintf(name, sizeof(name), "%s-%s", dev_name(parent), rname); rname = name; } reg_generate_netlink_event(rname, event); } return ret; } int _regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers, enum regulator_get_type get_type) { int i; int ret; for (i = 0; i < num_consumers; i++) consumers[i].consumer = NULL; for (i = 0; i < num_consumers; i++) { consumers[i].consumer = _regulator_get(dev, consumers[i].supply, get_type); if (IS_ERR(consumers[i].consumer)) { ret = dev_err_probe(dev, PTR_ERR(consumers[i].consumer), "Failed to get supply '%s'\n", consumers[i].supply); consumers[i].consumer = NULL; goto err; } if (consumers[i].init_load_uA > 0) { ret = regulator_set_load(consumers[i].consumer, consumers[i].init_load_uA); if (ret) { i++; goto err; } } } return 0; err: while (--i >= 0) regulator_put(consumers[i].consumer); return ret; } /** * regulator_bulk_get - get multiple regulator consumers * * @dev: Device to supply * @num_consumers: Number of consumers to register * @consumers: Configuration of consumers; clients are stored here. * * This helper function allows drivers to get several regulator * consumers in one operation. If any of the regulators cannot be * acquired then any regulators that were allocated will be freed * before returning to the caller. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers) { return _regulator_bulk_get(dev, num_consumers, consumers, NORMAL_GET); } EXPORT_SYMBOL_GPL(regulator_bulk_get); static void regulator_bulk_enable_async(void *data, async_cookie_t cookie) { struct regulator_bulk_data *bulk = data; bulk->ret = regulator_enable(bulk->consumer); } /** * regulator_bulk_enable - enable multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to enable multiple regulator * clients in a single API call. If any consumers cannot be enabled * then any others that were enabled will be disabled again prior to * return. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { ASYNC_DOMAIN_EXCLUSIVE(async_domain); int i; int ret = 0; for (i = 0; i < num_consumers; i++) { async_schedule_domain(regulator_bulk_enable_async, &consumers[i], &async_domain); } async_synchronize_full_domain(&async_domain); /* If any consumer failed we need to unwind any that succeeded */ for (i = 0; i < num_consumers; i++) { if (consumers[i].ret != 0) { ret = consumers[i].ret; goto err; } } return 0; err: for (i = 0; i < num_consumers; i++) { if (consumers[i].ret < 0) pr_err("Failed to enable %s: %pe\n", consumers[i].supply, ERR_PTR(consumers[i].ret)); else regulator_disable(consumers[i].consumer); } return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_enable); /** * regulator_bulk_disable - disable multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to disable multiple regulator * clients in a single API call. If any consumers cannot be disabled * then any others that were disabled will be enabled again prior to * return. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers) { int i; int ret, r; for (i = num_consumers - 1; i >= 0; --i) { ret = regulator_disable(consumers[i].consumer); if (ret != 0) goto err; } return 0; err: pr_err("Failed to disable %s: %pe\n", consumers[i].supply, ERR_PTR(ret)); for (++i; i < num_consumers; ++i) { r = regulator_enable(consumers[i].consumer); if (r != 0) pr_err("Failed to re-enable %s: %pe\n", consumers[i].supply, ERR_PTR(r)); } return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_disable); /** * regulator_bulk_force_disable - force disable multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to forcibly disable multiple regulator * clients in a single API call. * NOTE: This should be used for situations when device damage will * likely occur if the regulators are not disabled (e.g. over temp). * Although regulator_force_disable function call for some consumers can * return error numbers, the function is called for all consumers. * * Return: 0 on success or a negative error number on failure. */ int regulator_bulk_force_disable(int num_consumers, struct regulator_bulk_data *consumers) { int i; int ret = 0; for (i = 0; i < num_consumers; i++) { consumers[i].ret = regulator_force_disable(consumers[i].consumer); /* Store first error for reporting */ if (consumers[i].ret && !ret) ret = consumers[i].ret; } return ret; } EXPORT_SYMBOL_GPL(regulator_bulk_force_disable); /** * regulator_bulk_free - free multiple regulator consumers * * @num_consumers: Number of consumers * @consumers: Consumer data; clients are stored here. * * This convenience API allows consumers to free multiple regulator * clients in a single API call. */ void regulator_bulk_free(int num_consumers, struct regulator_bulk_data *consumers) { int i; for (i = 0; i < num_consumers; i++) { regulator_put(consumers[i].consumer); consumers[i].consumer = NULL; } } EXPORT_SYMBOL_GPL(regulator_bulk_free); /** * regulator_handle_critical - Handle events for system-critical regulators. * @rdev: The regulator device. * @event: The event being handled. * * This function handles critical events such as under-voltage, over-current, * and unknown errors for regulators deemed system-critical. On detecting such * events, it triggers a hardware protection shutdown with a defined timeout. */ static void regulator_handle_critical(struct regulator_dev *rdev, unsigned long event) { const char *reason = NULL; if (!rdev->constraints->system_critical) return; switch (event) { case REGULATOR_EVENT_UNDER_VOLTAGE: reason = "System critical regulator: voltage drop detected"; break; case REGULATOR_EVENT_OVER_CURRENT: reason = "System critical regulator: over-current detected"; break; case REGULATOR_EVENT_FAIL: reason = "System critical regulator: unknown error"; } if (!reason) return; hw_protection_trigger(reason, rdev->constraints->uv_less_critical_window_ms); } /** * regulator_notifier_call_chain - call regulator event notifier * @rdev: regulator source * @event: notifier block * @data: callback-specific data. * * Called by regulator drivers to notify clients a regulator event has * occurred. * * Return: %NOTIFY_DONE. */ int regulator_notifier_call_chain(struct regulator_dev *rdev, unsigned long event, void *data) { regulator_handle_critical(rdev, event); _notifier_call_chain(rdev, event, data); return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(regulator_notifier_call_chain); /** * regulator_mode_to_status - convert a regulator mode into a status * * @mode: Mode to convert * * Convert a regulator mode into a status. * * Return: %REGULATOR_STATUS_* value corresponding to given mode. */ int regulator_mode_to_status(unsigned int mode) { switch (mode) { case REGULATOR_MODE_FAST: return REGULATOR_STATUS_FAST; case REGULATOR_MODE_NORMAL: return REGULATOR_STATUS_NORMAL; case REGULATOR_MODE_IDLE: return REGULATOR_STATUS_IDLE; case REGULATOR_MODE_STANDBY: return REGULATOR_STATUS_STANDBY; default: return REGULATOR_STATUS_UNDEFINED; } } EXPORT_SYMBOL_GPL(regulator_mode_to_status); static struct attribute *regulator_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_num_users.attr, &dev_attr_type.attr, &dev_attr_microvolts.attr, &dev_attr_microamps.attr, &dev_attr_opmode.attr, &dev_attr_state.attr, &dev_attr_status.attr, &dev_attr_bypass.attr, &dev_attr_requested_microamps.attr, &dev_attr_min_microvolts.attr, &dev_attr_max_microvolts.attr, &dev_attr_min_microamps.attr, &dev_attr_max_microamps.attr, &dev_attr_under_voltage.attr, &dev_attr_over_current.attr, &dev_attr_regulation_out.attr, &dev_attr_fail.attr, &dev_attr_over_temp.attr, &dev_attr_under_voltage_warn.attr, &dev_attr_over_current_warn.attr, &dev_attr_over_voltage_warn.attr, &dev_attr_over_temp_warn.attr, &dev_attr_suspend_standby_state.attr, &dev_attr_suspend_mem_state.attr, &dev_attr_suspend_disk_state.attr, &dev_attr_suspend_standby_microvolts.attr, &dev_attr_suspend_mem_microvolts.attr, &dev_attr_suspend_disk_microvolts.attr, &dev_attr_suspend_standby_mode.attr, &dev_attr_suspend_mem_mode.attr, &dev_attr_suspend_disk_mode.attr, &dev_attr_power_budget_milliwatt.attr, &dev_attr_power_requested_milliwatt.attr, NULL }; /* * To avoid cluttering sysfs (and memory) with useless state, only * create attributes that can be meaningfully displayed. */ static umode_t regulator_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct device *dev = kobj_to_dev(kobj); struct regulator_dev *rdev = dev_to_rdev(dev); const struct regulator_ops *ops = rdev->desc->ops; umode_t mode = attr->mode; /* these three are always present */ if (attr == &dev_attr_name.attr || attr == &dev_attr_num_users.attr || attr == &dev_attr_type.attr) return mode; /* some attributes need specific methods to be displayed */ if (attr == &dev_attr_microvolts.attr) { if ((ops->get_voltage && ops->get_voltage(rdev) >= 0) || (ops->get_voltage_sel && ops->get_voltage_sel(rdev) >= 0) || (ops->list_voltage && ops->list_voltage(rdev, 0) >= 0) || (rdev->desc->fixed_uV && rdev->desc->n_voltages == 1)) return mode; return 0; } if (attr == &dev_attr_microamps.attr) return ops->get_current_limit ? mode : 0; if (attr == &dev_attr_opmode.attr) return ops->get_mode ? mode : 0; if (attr == &dev_attr_state.attr) return (rdev->ena_pin || ops->is_enabled) ? mode : 0; if (attr == &dev_attr_status.attr) return ops->get_status ? mode : 0; if (attr == &dev_attr_bypass.attr) return ops->get_bypass ? mode : 0; if (attr == &dev_attr_under_voltage.attr || attr == &dev_attr_over_current.attr || attr == &dev_attr_regulation_out.attr || attr == &dev_attr_fail.attr || attr == &dev_attr_over_temp.attr || attr == &dev_attr_under_voltage_warn.attr || attr == &dev_attr_over_current_warn.attr || attr == &dev_attr_over_voltage_warn.attr || attr == &dev_attr_over_temp_warn.attr) return ops->get_error_flags ? mode : 0; /* constraints need specific supporting methods */ if (attr == &dev_attr_min_microvolts.attr || attr == &dev_attr_max_microvolts.attr) return (ops->set_voltage || ops->set_voltage_sel) ? mode : 0; if (attr == &dev_attr_min_microamps.attr || attr == &dev_attr_max_microamps.attr) return ops->set_current_limit ? mode : 0; if (attr == &dev_attr_suspend_standby_state.attr || attr == &dev_attr_suspend_mem_state.attr || attr == &dev_attr_suspend_disk_state.attr) return mode; if (attr == &dev_attr_suspend_standby_microvolts.attr || attr == &dev_attr_suspend_mem_microvolts.attr || attr == &dev_attr_suspend_disk_microvolts.attr) return ops->set_suspend_voltage ? mode : 0; if (attr == &dev_attr_suspend_standby_mode.attr || attr == &dev_attr_suspend_mem_mode.attr || attr == &dev_attr_suspend_disk_mode.attr) return ops->set_suspend_mode ? mode : 0; if (attr == &dev_attr_power_budget_milliwatt.attr || attr == &dev_attr_power_requested_milliwatt.attr) return rdev->constraints->pw_budget_mW != INT_MAX ? mode : 0; return mode; } static const struct attribute_group regulator_dev_group = { .attrs = regulator_dev_attrs, .is_visible = regulator_attr_is_visible, }; static const struct attribute_group *regulator_dev_groups[] = { &regulator_dev_group, NULL }; static void regulator_dev_release(struct device *dev) { struct regulator_dev *rdev = dev_get_drvdata(dev); debugfs_remove_recursive(rdev->debugfs); kfree(rdev->constraints); of_node_put(rdev->dev.of_node); kfree(rdev); } static void rdev_init_debugfs(struct regulator_dev *rdev) { struct device *parent = rdev->dev.parent; const char *rname = rdev_get_name(rdev); char name[NAME_MAX]; /* Avoid duplicate debugfs directory names */ if (parent && rname == rdev->desc->name) { snprintf(name, sizeof(name), "%s-%s", dev_name(parent), rname); rname = name; } rdev->debugfs = debugfs_create_dir(rname, debugfs_root); if (IS_ERR(rdev->debugfs)) rdev_dbg(rdev, "Failed to create debugfs directory\n"); debugfs_create_u32("use_count", 0444, rdev->debugfs, &rdev->use_count); debugfs_create_u32("open_count", 0444, rdev->debugfs, &rdev->open_count); debugfs_create_u32("bypass_count", 0444, rdev->debugfs, &rdev->bypass_count); } static int regulator_register_resolve_supply(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); if (regulator_resolve_supply(rdev)) rdev_dbg(rdev, "unable to resolve supply\n"); return 0; } int regulator_coupler_register(struct regulator_coupler *coupler) { mutex_lock(&regulator_list_mutex); list_add_tail(&coupler->list, &regulator_coupler_list); mutex_unlock(&regulator_list_mutex); return 0; } static struct regulator_coupler * regulator_find_coupler(struct regulator_dev *rdev) { struct regulator_coupler *coupler; int err; /* * Note that regulators are appended to the list and the generic * coupler is registered first, hence it will be attached at last * if nobody cared. */ list_for_each_entry_reverse(coupler, &regulator_coupler_list, list) { err = coupler->attach_regulator(coupler, rdev); if (!err) { if (!coupler->balance_voltage && rdev->coupling_desc.n_coupled > 2) goto err_unsupported; return coupler; } if (err < 0) return ERR_PTR(err); if (err == 1) continue; break; } return ERR_PTR(-EINVAL); err_unsupported: if (coupler->detach_regulator) coupler->detach_regulator(coupler, rdev); rdev_err(rdev, "Voltage balancing for multiple regulator couples is unimplemented\n"); return ERR_PTR(-EPERM); } static void regulator_resolve_coupling(struct regulator_dev *rdev) { struct regulator_coupler *coupler = rdev->coupling_desc.coupler; struct coupling_desc *c_desc = &rdev->coupling_desc; int n_coupled = c_desc->n_coupled; struct regulator_dev *c_rdev; int i; for (i = 1; i < n_coupled; i++) { /* already resolved */ if (c_desc->coupled_rdevs[i]) continue; c_rdev = of_parse_coupled_regulator(rdev, i - 1); if (!c_rdev) continue; if (c_rdev->coupling_desc.coupler != coupler) { rdev_err(rdev, "coupler mismatch with %s\n", rdev_get_name(c_rdev)); return; } c_desc->coupled_rdevs[i] = c_rdev; c_desc->n_resolved++; regulator_resolve_coupling(c_rdev); } } static void regulator_remove_coupling(struct regulator_dev *rdev) { struct regulator_coupler *coupler = rdev->coupling_desc.coupler; struct coupling_desc *__c_desc, *c_desc = &rdev->coupling_desc; struct regulator_dev *__c_rdev, *c_rdev; unsigned int __n_coupled, n_coupled; int i, k; int err; n_coupled = c_desc->n_coupled; for (i = 1; i < n_coupled; i++) { c_rdev = c_desc->coupled_rdevs[i]; if (!c_rdev) continue; regulator_lock(c_rdev); __c_desc = &c_rdev->coupling_desc; __n_coupled = __c_desc->n_coupled; for (k = 1; k < __n_coupled; k++) { __c_rdev = __c_desc->coupled_rdevs[k]; if (__c_rdev == rdev) { __c_desc->coupled_rdevs[k] = NULL; __c_desc->n_resolved--; break; } } regulator_unlock(c_rdev); c_desc->coupled_rdevs[i] = NULL; c_desc->n_resolved--; } if (coupler && coupler->detach_regulator) { err = coupler->detach_regulator(coupler, rdev); if (err) rdev_err(rdev, "failed to detach from coupler: %pe\n", ERR_PTR(err)); } rdev->coupling_desc.n_coupled = 0; kfree(rdev->coupling_desc.coupled_rdevs); rdev->coupling_desc.coupled_rdevs = NULL; } static int regulator_init_coupling(struct regulator_dev *rdev) { struct regulator_dev **coupled; int err, n_phandles; if (!IS_ENABLED(CONFIG_OF)) n_phandles = 0; else n_phandles = of_get_n_coupled(rdev); coupled = kcalloc(n_phandles + 1, sizeof(*coupled), GFP_KERNEL); if (!coupled) return -ENOMEM; rdev->coupling_desc.coupled_rdevs = coupled; /* * Every regulator should always have coupling descriptor filled with * at least pointer to itself. */ rdev->coupling_desc.coupled_rdevs[0] = rdev; rdev->coupling_desc.n_coupled = n_phandles + 1; rdev->coupling_desc.n_resolved++; /* regulator isn't coupled */ if (n_phandles == 0) return 0; if (!of_check_coupling_data(rdev)) return -EPERM; mutex_lock(&regulator_list_mutex); rdev->coupling_desc.coupler = regulator_find_coupler(rdev); mutex_unlock(&regulator_list_mutex); if (IS_ERR(rdev->coupling_desc.coupler)) { err = PTR_ERR(rdev->coupling_desc.coupler); rdev_err(rdev, "failed to get coupler: %pe\n", ERR_PTR(err)); return err; } return 0; } static int generic_coupler_attach(struct regulator_coupler *coupler, struct regulator_dev *rdev) { if (rdev->coupling_desc.n_coupled > 2) { rdev_err(rdev, "Voltage balancing for multiple regulator couples is unimplemented\n"); return -EPERM; } if (!rdev->constraints->always_on) { rdev_err(rdev, "Coupling of a non always-on regulator is unimplemented\n"); return -ENOTSUPP; } return 0; } static struct regulator_coupler generic_regulator_coupler = { .attach_regulator = generic_coupler_attach, }; /** * regulator_register - register regulator * @dev: the device that drive the regulator * @regulator_desc: regulator to register * @cfg: runtime configuration for regulator * * Called by regulator drivers to register a regulator. * * Return: Pointer to a valid &struct regulator_dev on success or * an ERR_PTR() encoded negative error number on failure. */ struct regulator_dev * regulator_register(struct device *dev, const struct regulator_desc *regulator_desc, const struct regulator_config *cfg) { const struct regulator_init_data *init_data; struct regulator_config *config = NULL; static atomic_t regulator_no = ATOMIC_INIT(-1); struct regulator_dev *rdev; bool dangling_cfg_gpiod = false; bool dangling_of_gpiod = false; int ret, i; bool resolved_early = false; if (cfg == NULL) return ERR_PTR(-EINVAL); if (cfg->ena_gpiod) dangling_cfg_gpiod = true; if (regulator_desc == NULL) { ret = -EINVAL; goto rinse; } WARN_ON(!dev || !cfg->dev); if (regulator_desc->name == NULL || regulator_desc->ops == NULL) { ret = -EINVAL; goto rinse; } if (regulator_desc->type != REGULATOR_VOLTAGE && regulator_desc->type != REGULATOR_CURRENT) { ret = -EINVAL; goto rinse; } /* Only one of each should be implemented */ WARN_ON(regulator_desc->ops->get_voltage && regulator_desc->ops->get_voltage_sel); WARN_ON(regulator_desc->ops->set_voltage && regulator_desc->ops->set_voltage_sel); /* If we're using selectors we must implement list_voltage. */ if (regulator_desc->ops->get_voltage_sel && !regulator_desc->ops->list_voltage) { ret = -EINVAL; goto rinse; } if (regulator_desc->ops->set_voltage_sel && !regulator_desc->ops->list_voltage) { ret = -EINVAL; goto rinse; } rdev = kzalloc(sizeof(struct regulator_dev), GFP_KERNEL); if (rdev == NULL) { ret = -ENOMEM; goto rinse; } device_initialize(&rdev->dev); dev_set_drvdata(&rdev->dev, rdev); rdev->dev.class = &regulator_class; spin_lock_init(&rdev->err_lock); /* * Duplicate the config so the driver could override it after * parsing init data. */ config = kmemdup(cfg, sizeof(*cfg), GFP_KERNEL); if (config == NULL) { ret = -ENOMEM; goto clean; } /* * DT may override the config->init_data provided if the platform * needs to do so. If so, config->init_data is completely ignored. */ init_data = regulator_of_get_init_data(dev, regulator_desc, config, &rdev->dev.of_node); /* * Sometimes not all resources are probed already so we need to take * that into account. This happens most the time if the ena_gpiod comes * from a gpio extender or something else. */ if (PTR_ERR(init_data) == -EPROBE_DEFER) { ret = -EPROBE_DEFER; goto clean; } /* * We need to keep track of any GPIO descriptor coming from the * device tree until we have handled it over to the core. If the * config that was passed in to this function DOES NOT contain * a descriptor, and the config after this call DOES contain * a descriptor, we definitely got one from parsing the device * tree. */ if (!cfg->ena_gpiod && config->ena_gpiod) dangling_of_gpiod = true; if (!init_data) { init_data = config->init_data; rdev->dev.of_node = of_node_get(config->of_node); } ww_mutex_init(&rdev->mutex, &regulator_ww_class); rdev->reg_data = config->driver_data; rdev->owner = regulator_desc->owner; rdev->desc = regulator_desc; if (config->regmap) rdev->regmap = config->regmap; else if (dev_get_regmap(dev, NULL)) rdev->regmap = dev_get_regmap(dev, NULL); else if (dev->parent) rdev->regmap = dev_get_regmap(dev->parent, NULL); INIT_LIST_HEAD(&rdev->consumer_list); INIT_LIST_HEAD(&rdev->list); BLOCKING_INIT_NOTIFIER_HEAD(&rdev->notifier); INIT_DELAYED_WORK(&rdev->disable_work, regulator_disable_work); if (init_data && init_data->supply_regulator) rdev->supply_name = init_data->supply_regulator; else if (regulator_desc->supply_name) rdev->supply_name = regulator_desc->supply_name; /* register with sysfs */ rdev->dev.parent = config->dev; dev_set_name(&rdev->dev, "regulator.%lu", (unsigned long) atomic_inc_return(&regulator_no)); /* set regulator constraints */ if (init_data) rdev->constraints = kmemdup(&init_data->constraints, sizeof(*rdev->constraints), GFP_KERNEL); else rdev->constraints = kzalloc(sizeof(*rdev->constraints), GFP_KERNEL); if (!rdev->constraints) { ret = -ENOMEM; goto wash; } if (regulator_desc->init_cb) { ret = regulator_desc->init_cb(rdev, config); if (ret < 0) goto wash; } if ((rdev->supply_name && !rdev->supply) && (rdev->constraints->always_on || rdev->constraints->boot_on)) { ret = regulator_resolve_supply(rdev); if (ret) rdev_dbg(rdev, "unable to resolve supply early: %pe\n", ERR_PTR(ret)); resolved_early = true; } if (config->ena_gpiod) { ret = regulator_ena_gpio_request(rdev, config); if (ret != 0) { rdev_err(rdev, "Failed to request enable GPIO: %pe\n", ERR_PTR(ret)); goto wash; } /* The regulator core took over the GPIO descriptor */ dangling_cfg_gpiod = false; dangling_of_gpiod = false; } ret = set_machine_constraints(rdev); if (ret == -EPROBE_DEFER && !resolved_early) { /* Regulator might be in bypass mode and so needs its supply * to set the constraints */ /* FIXME: this currently triggers a chicken-and-egg problem * when creating -SUPPLY symlink in sysfs to a regulator * that is just being created */ rdev_dbg(rdev, "will resolve supply early: %s\n", rdev->supply_name); ret = regulator_resolve_supply(rdev); if (!ret) ret = set_machine_constraints(rdev); else rdev_dbg(rdev, "unable to resolve supply early: %pe\n", ERR_PTR(ret)); } if (ret < 0) goto wash; ret = regulator_init_coupling(rdev); if (ret < 0) goto wash; /* add consumers devices */ if (init_data) { for (i = 0; i < init_data->num_consumer_supplies; i++) { ret = set_consumer_device_supply(rdev, init_data->consumer_supplies[i].dev_name, init_data->consumer_supplies[i].supply); if (ret < 0) { dev_err(dev, "Failed to set supply %s\n", init_data->consumer_supplies[i].supply); goto unset_supplies; } } } if (!rdev->desc->ops->get_voltage && !rdev->desc->ops->list_voltage && !rdev->desc->fixed_uV) rdev->is_switch = true; ret = device_add(&rdev->dev); if (ret != 0) goto unset_supplies; rdev_init_debugfs(rdev); /* try to resolve regulators coupling since a new one was registered */ mutex_lock(&regulator_list_mutex); regulator_resolve_coupling(rdev); mutex_unlock(&regulator_list_mutex); /* try to resolve regulators supply since a new one was registered */ class_for_each_device(&regulator_class, NULL, NULL, regulator_register_resolve_supply); kfree(config); return rdev; unset_supplies: mutex_lock(&regulator_list_mutex); unset_regulator_supplies(rdev); regulator_remove_coupling(rdev); mutex_unlock(&regulator_list_mutex); wash: regulator_put(rdev->supply); kfree(rdev->coupling_desc.coupled_rdevs); mutex_lock(&regulator_list_mutex); regulator_ena_gpio_free(rdev); mutex_unlock(&regulator_list_mutex); clean: if (dangling_of_gpiod) gpiod_put(config->ena_gpiod); kfree(config); put_device(&rdev->dev); rinse: if (dangling_cfg_gpiod) gpiod_put(cfg->ena_gpiod); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(regulator_register); /** * regulator_unregister - unregister regulator * @rdev: regulator to unregister * * Called by regulator drivers to unregister a regulator. */ void regulator_unregister(struct regulator_dev *rdev) { if (rdev == NULL) return; if (rdev->supply) { while (rdev->use_count--) regulator_disable(rdev->supply); regulator_put(rdev->supply); } flush_work(&rdev->disable_work.work); mutex_lock(&regulator_list_mutex); WARN_ON(rdev->open_count); regulator_remove_coupling(rdev); unset_regulator_supplies(rdev); list_del(&rdev->list); regulator_ena_gpio_free(rdev); device_unregister(&rdev->dev); mutex_unlock(&regulator_list_mutex); } EXPORT_SYMBOL_GPL(regulator_unregister); #ifdef CONFIG_SUSPEND /** * regulator_suspend - prepare regulators for system wide suspend * @dev: ``&struct device`` pointer that is passed to _regulator_suspend() * * Configure each regulator with it's suspend operating parameters for state. * * Return: 0 on success or a negative error number on failure. */ static int regulator_suspend(struct device *dev) { struct regulator_dev *rdev = dev_to_rdev(dev); suspend_state_t state = pm_suspend_target_state; int ret; const struct regulator_state *rstate; rstate = regulator_get_suspend_state_check(rdev, state); if (!rstate) return 0; regulator_lock(rdev); ret = __suspend_set_state(rdev, rstate); regulator_unlock(rdev); return ret; } static int regulator_resume(struct device *dev) { suspend_state_t state = pm_suspend_target_state; struct regulator_dev *rdev = dev_to_rdev(dev); struct regulator_state *rstate; int ret = 0; rstate = regulator_get_suspend_state(rdev, state); if (rstate == NULL) return 0; /* Avoid grabbing the lock if we don't need to */ if (!rdev->desc->ops->resume) return 0; regulator_lock(rdev); if (rstate->enabled == ENABLE_IN_SUSPEND || rstate->enabled == DISABLE_IN_SUSPEND) ret = rdev->desc->ops->resume(rdev); regulator_unlock(rdev); return ret; } #else /* !CONFIG_SUSPEND */ #define regulator_suspend NULL #define regulator_resume NULL #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM static const struct dev_pm_ops __maybe_unused regulator_pm_ops = { .suspend = regulator_suspend, .resume = regulator_resume, }; #endif const struct class regulator_class = { .name = "regulator", .dev_release = regulator_dev_release, .dev_groups = regulator_dev_groups, #ifdef CONFIG_PM .pm = &regulator_pm_ops, #endif }; /** * regulator_has_full_constraints - the system has fully specified constraints * * Calling this function will cause the regulator API to disable all * regulators which have a zero use count and don't have an always_on * constraint in a late_initcall. * * The intention is that this will become the default behaviour in a * future kernel release so users are encouraged to use this facility * now. */ void regulator_has_full_constraints(void) { has_full_constraints = 1; } EXPORT_SYMBOL_GPL(regulator_has_full_constraints); /** * rdev_get_drvdata - get rdev regulator driver data * @rdev: regulator * * Get rdev regulator driver private data. This call can be used in the * regulator driver context. * * Return: Pointer to regulator driver private data. */ void *rdev_get_drvdata(struct regulator_dev *rdev) { return rdev->reg_data; } EXPORT_SYMBOL_GPL(rdev_get_drvdata); /** * regulator_get_drvdata - get regulator driver data * @regulator: regulator * * Get regulator driver private data. This call can be used in the consumer * driver context when non API regulator specific functions need to be called. * * Return: Pointer to regulator driver private data. */ void *regulator_get_drvdata(struct regulator *regulator) { return regulator->rdev->reg_data; } EXPORT_SYMBOL_GPL(regulator_get_drvdata); /** * regulator_set_drvdata - set regulator driver data * @regulator: regulator * @data: data */ void regulator_set_drvdata(struct regulator *regulator, void *data) { regulator->rdev->reg_data = data; } EXPORT_SYMBOL_GPL(regulator_set_drvdata); /** * rdev_get_id - get regulator ID * @rdev: regulator * * Return: Regulator ID for @rdev. */ int rdev_get_id(struct regulator_dev *rdev) { return rdev->desc->id; } EXPORT_SYMBOL_GPL(rdev_get_id); struct device *rdev_get_dev(struct regulator_dev *rdev) { return &rdev->dev; } EXPORT_SYMBOL_GPL(rdev_get_dev); struct regmap *rdev_get_regmap(struct regulator_dev *rdev) { return rdev->regmap; } EXPORT_SYMBOL_GPL(rdev_get_regmap); void *regulator_get_init_drvdata(struct regulator_init_data *reg_init_data) { return reg_init_data->driver_data; } EXPORT_SYMBOL_GPL(regulator_get_init_drvdata); #ifdef CONFIG_DEBUG_FS static int supply_map_show(struct seq_file *sf, void *data) { struct regulator_map *map; list_for_each_entry(map, &regulator_map_list, list) { seq_printf(sf, "%s -> %s.%s\n", rdev_get_name(map->regulator), map->dev_name, map->supply); } return 0; } DEFINE_SHOW_ATTRIBUTE(supply_map); struct summary_data { struct seq_file *s; struct regulator_dev *parent; int level; }; static void regulator_summary_show_subtree(struct seq_file *s, struct regulator_dev *rdev, int level); static int regulator_summary_show_children(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct summary_data *summary_data = data; if (rdev->supply && rdev->supply->rdev == summary_data->parent) regulator_summary_show_subtree(summary_data->s, rdev, summary_data->level + 1); return 0; } static void regulator_summary_show_subtree(struct seq_file *s, struct regulator_dev *rdev, int level) { struct regulation_constraints *c; struct regulator *consumer; struct summary_data summary_data; unsigned int opmode; if (!rdev) return; opmode = _regulator_get_mode_unlocked(rdev); seq_printf(s, "%*s%-*s %3d %4d %6d %7s ", level * 3 + 1, "", 30 - level * 3, rdev_get_name(rdev), rdev->use_count, rdev->open_count, rdev->bypass_count, regulator_opmode_to_str(opmode)); seq_printf(s, "%5dmV ", regulator_get_voltage_rdev(rdev) / 1000); seq_printf(s, "%5dmA ", _regulator_get_current_limit_unlocked(rdev) / 1000); c = rdev->constraints; if (c) { switch (rdev->desc->type) { case REGULATOR_VOLTAGE: seq_printf(s, "%5dmV %5dmV ", c->min_uV / 1000, c->max_uV / 1000); break; case REGULATOR_CURRENT: seq_printf(s, "%5dmA %5dmA ", c->min_uA / 1000, c->max_uA / 1000); break; } } seq_puts(s, "\n"); list_for_each_entry(consumer, &rdev->consumer_list, list) { if (consumer->dev && consumer->dev->class == &regulator_class) continue; seq_printf(s, "%*s%-*s ", (level + 1) * 3 + 1, "", 30 - (level + 1) * 3, consumer->supply_name ? consumer->supply_name : consumer->dev ? dev_name(consumer->dev) : "deviceless"); switch (rdev->desc->type) { case REGULATOR_VOLTAGE: seq_printf(s, "%3d %33dmA%c%5dmV %5dmV", consumer->enable_count, consumer->uA_load / 1000, consumer->uA_load && !consumer->enable_count ? '*' : ' ', consumer->voltage[PM_SUSPEND_ON].min_uV / 1000, consumer->voltage[PM_SUSPEND_ON].max_uV / 1000); break; case REGULATOR_CURRENT: break; } seq_puts(s, "\n"); } summary_data.s = s; summary_data.level = level; summary_data.parent = rdev; class_for_each_device(&regulator_class, NULL, &summary_data, regulator_summary_show_children); } struct summary_lock_data { struct ww_acquire_ctx *ww_ctx; struct regulator_dev **new_contended_rdev; struct regulator_dev **old_contended_rdev; }; static int regulator_summary_lock_one(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct summary_lock_data *lock_data = data; int ret = 0; if (rdev != *lock_data->old_contended_rdev) { ret = regulator_lock_nested(rdev, lock_data->ww_ctx); if (ret == -EDEADLK) *lock_data->new_contended_rdev = rdev; else WARN_ON_ONCE(ret); } else { *lock_data->old_contended_rdev = NULL; } return ret; } static int regulator_summary_unlock_one(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct summary_lock_data *lock_data = data; if (lock_data) { if (rdev == *lock_data->new_contended_rdev) return -EDEADLK; } regulator_unlock(rdev); return 0; } static int regulator_summary_lock_all(struct ww_acquire_ctx *ww_ctx, struct regulator_dev **new_contended_rdev, struct regulator_dev **old_contended_rdev) { struct summary_lock_data lock_data; int ret; lock_data.ww_ctx = ww_ctx; lock_data.new_contended_rdev = new_contended_rdev; lock_data.old_contended_rdev = old_contended_rdev; ret = class_for_each_device(&regulator_class, NULL, &lock_data, regulator_summary_lock_one); if (ret) class_for_each_device(&regulator_class, NULL, &lock_data, regulator_summary_unlock_one); return ret; } static void regulator_summary_lock(struct ww_acquire_ctx *ww_ctx) { struct regulator_dev *new_contended_rdev = NULL; struct regulator_dev *old_contended_rdev = NULL; int err; mutex_lock(&regulator_list_mutex); ww_acquire_init(ww_ctx, &regulator_ww_class); do { if (new_contended_rdev) { ww_mutex_lock_slow(&new_contended_rdev->mutex, ww_ctx); old_contended_rdev = new_contended_rdev; old_contended_rdev->ref_cnt++; old_contended_rdev->mutex_owner = current; } err = regulator_summary_lock_all(ww_ctx, &new_contended_rdev, &old_contended_rdev); if (old_contended_rdev) regulator_unlock(old_contended_rdev); } while (err == -EDEADLK); ww_acquire_done(ww_ctx); } static void regulator_summary_unlock(struct ww_acquire_ctx *ww_ctx) { class_for_each_device(&regulator_class, NULL, NULL, regulator_summary_unlock_one); ww_acquire_fini(ww_ctx); mutex_unlock(&regulator_list_mutex); } static int regulator_summary_show_roots(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct seq_file *s = data; if (!rdev->supply) regulator_summary_show_subtree(s, rdev, 0); return 0; } static int regulator_summary_show(struct seq_file *s, void *data) { struct ww_acquire_ctx ww_ctx; seq_puts(s, " regulator use open bypass opmode voltage current min max\n"); seq_puts(s, "---------------------------------------------------------------------------------------\n"); regulator_summary_lock(&ww_ctx); class_for_each_device(&regulator_class, NULL, s, regulator_summary_show_roots); regulator_summary_unlock(&ww_ctx); return 0; } DEFINE_SHOW_ATTRIBUTE(regulator_summary); #endif /* CONFIG_DEBUG_FS */ static int __init regulator_init(void) { int ret; ret = class_register(&regulator_class); debugfs_root = debugfs_create_dir("regulator", NULL); if (IS_ERR(debugfs_root)) pr_debug("regulator: Failed to create debugfs directory\n"); #ifdef CONFIG_DEBUG_FS debugfs_create_file("supply_map", 0444, debugfs_root, NULL, &supply_map_fops); debugfs_create_file("regulator_summary", 0444, debugfs_root, NULL, &regulator_summary_fops); #endif regulator_dummy_init(); regulator_coupler_register(&generic_regulator_coupler); return ret; } /* init early to allow our consumers to complete system booting */ core_initcall(regulator_init); static int regulator_late_cleanup(struct device *dev, void *data) { struct regulator_dev *rdev = dev_to_rdev(dev); struct regulation_constraints *c = rdev->constraints; int ret; if (c && c->always_on) return 0; if (!regulator_ops_is_valid(rdev, REGULATOR_CHANGE_STATUS)) return 0; regulator_lock(rdev); if (rdev->use_count) goto unlock; /* If reading the status failed, assume that it's off. */ if (_regulator_is_enabled(rdev) <= 0) goto unlock; if (have_full_constraints()) { /* We log since this may kill the system if it goes * wrong. */ rdev_info(rdev, "disabling\n"); ret = _regulator_do_disable(rdev); if (ret != 0) rdev_err(rdev, "couldn't disable: %pe\n", ERR_PTR(ret)); } else { /* The intention is that in future we will * assume that full constraints are provided * so warn even if we aren't going to do * anything here. */ rdev_warn(rdev, "incomplete constraints, leaving on\n"); } unlock: regulator_unlock(rdev); return 0; } static bool regulator_ignore_unused; static int __init regulator_ignore_unused_setup(char *__unused) { regulator_ignore_unused = true; return 1; } __setup("regulator_ignore_unused", regulator_ignore_unused_setup); static void regulator_init_complete_work_function(struct work_struct *work) { /* * Regulators may had failed to resolve their input supplies * when were registered, either because the input supply was * not registered yet or because its parent device was not * bound yet. So attempt to resolve the input supplies for * pending regulators before trying to disable unused ones. */ class_for_each_device(&regulator_class, NULL, NULL, regulator_register_resolve_supply); /* * For debugging purposes, it may be useful to prevent unused * regulators from being disabled. */ if (regulator_ignore_unused) { pr_warn("regulator: Not disabling unused regulators\n"); return; } /* If we have a full configuration then disable any regulators * we have permission to change the status for and which are * not in use or always_on. This is effectively the default * for DT and ACPI as they have full constraints. */ class_for_each_device(&regulator_class, NULL, NULL, regulator_late_cleanup); } static DECLARE_DELAYED_WORK(regulator_init_complete_work, regulator_init_complete_work_function); static int __init regulator_init_complete(void) { /* * Since DT doesn't provide an idiomatic mechanism for * enabling full constraints and since it's much more natural * with DT to provide them just assume that a DT enabled * system has full constraints. */ if (of_have_populated_dt()) has_full_constraints = true; /* * We punt completion for an arbitrary amount of time since * systems like distros will load many drivers from userspace * so consumers might not always be ready yet, this is * particularly an issue with laptops where this might bounce * the display off then on. Ideally we'd get a notification * from userspace when this happens but we don't so just wait * a bit and hope we waited long enough. It'd be better if * we'd only do this on systems that need it, and a kernel * command line option might be useful. */ schedule_delayed_work(&regulator_init_complete_work, msecs_to_jiffies(30000)); return 0; } late_initcall_sync(regulator_init_complete);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_TUNNEL_H #define _NET_IP6_TUNNEL_H #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/dst_cache.h> #define IP6TUNNEL_ERR_TIMEO (30*HZ) /* capable of sending packets */ #define IP6_TNL_F_CAP_XMIT 0x10000 /* capable of receiving packets */ #define IP6_TNL_F_CAP_RCV 0x20000 /* determine capability on a per-packet basis */ #define IP6_TNL_F_CAP_PER_PACKET 0x40000 struct __ip6_tnl_parm { char name[IFNAMSIZ]; /* name of tunnel device */ int link; /* ifindex of underlying L2 interface */ __u8 proto; /* tunnel protocol */ __u8 encap_limit; /* encapsulation limit for tunnel */ __u8 hop_limit; /* hop limit for tunnel */ bool collect_md; __be32 flowinfo; /* traffic class and flowlabel for tunnel */ __u32 flags; /* tunnel flags */ struct in6_addr laddr; /* local tunnel end-point address */ struct in6_addr raddr; /* remote tunnel end-point address */ IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; __u32 fwmark; __u32 index; /* ERSPAN type II index */ __u8 erspan_ver; /* ERSPAN version */ __u8 dir; /* direction */ __u16 hwid; /* hwid */ }; /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ struct net_device *dev; /* virtual device associated with tunnel */ netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ struct dst_cache dst_cache; /* cached dst */ struct gro_cells gro_cells; int err_count; unsigned long err_time; /* These fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ int encap_hlen; /* Encap header length (FOU,GUE) */ struct ip_tunnel_encap encap; int mlink; }; struct ip6_tnl_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6); int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); }; #ifdef CONFIG_INET extern const struct ip6_tnl_encap_ops __rcu * ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap); static inline int ip6_encap_hlen(struct ip_tunnel_encap *e) { const struct ip6_tnl_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, u8 *protocol, struct flowi6 *fl6) { const struct ip6_tnl_encap_ops *ops; int ret = -EINVAL; if (t->encap.type == TUNNEL_ENCAP_NONE) return 0; if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(ip6tun_encaps[t->encap.type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, &t->encap, protocol, fl6); rcu_read_unlock(); return ret; } /* Tunnel encapsulation limit destination sub-option */ struct ipv6_tlv_tnl_enc_lim { __u8 type; /* type-code for option */ __u8 length; /* option length */ __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto); __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw); __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); struct net *ip6_tnl_get_link_net(const struct net_device *dev); int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(skb_dst_dev_net(skb), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = -1; iptunnel_xmit_stats(dev, pkt_len); } } #endif #endif
8 13 7 14 6 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 /* SPDX-License-Identifier: GPL-2.0-only */ /* * vivid-core.h - core datastructures * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #ifndef _VIVID_CORE_H_ #define _VIVID_CORE_H_ #include <linux/fb.h> #include <linux/workqueue.h> #include <media/cec.h> #include <media/videobuf2-v4l2.h> #include <media/v4l2-device.h> #include <media/v4l2-dev.h> #include <media/v4l2-ctrls.h> #include <media/tpg/v4l2-tpg.h> #include "vivid-rds-gen.h" #include "vivid-vbi-gen.h" #define dprintk(dev, level, fmt, arg...) \ v4l2_dbg(level, vivid_debug, &dev->v4l2_dev, fmt, ## arg) /* The maximum number of inputs */ #define MAX_INPUTS 16 /* The maximum number of outputs */ #define MAX_OUTPUTS 16 /* The maximum number of video capture buffers */ #define MAX_VID_CAP_BUFFERS 64 /* The maximum up or down scaling factor is 4 */ #define MAX_ZOOM 4 /* The maximum image width/height are set to 4K DMT */ #define MAX_WIDTH 4096 #define MAX_HEIGHT 2160 /* The minimum image width/height */ #define MIN_WIDTH 16 #define MIN_HEIGHT MIN_WIDTH /* Pixel Array control divider */ #define PIXEL_ARRAY_DIV MIN_WIDTH /* The data_offset of plane 0 for the multiplanar formats */ #define PLANE0_DATA_OFFSET 128 /* The supported TV frequency range in MHz */ #define MIN_TV_FREQ (44U * 16U) #define MAX_TV_FREQ (958U * 16U) /* The number of samples returned in every SDR buffer */ #define SDR_CAP_SAMPLES_PER_BUF 0x4000 /* used by the threads to know when to resync internal counters */ #define JIFFIES_PER_DAY (3600U * 24U * HZ) #define JIFFIES_RESYNC (JIFFIES_PER_DAY * (0xf0000000U / JIFFIES_PER_DAY)) /* * Maximum number of HDMI inputs allowed by vivid, due to limitations * of the Physical Address in the EDID and used by CEC we stop at 15 * inputs and outputs. */ #define MAX_HDMI_INPUTS 15 #define MAX_HDMI_OUTPUTS 15 /* Maximum number of S-Video inputs allowed by vivid */ #define MAX_SVID_INPUTS 16 /* The maximum number of items in a menu control */ #define MAX_MENU_ITEMS BITS_PER_LONG_LONG /* Number of fixed menu items in the 'Connected To' menu controls */ #define FIXED_MENU_ITEMS 2 /* The maximum number of vivid devices */ #define VIVID_MAX_DEVS CONFIG_VIDEO_VIVID_MAX_DEVS extern const struct v4l2_rect vivid_min_rect; extern const struct v4l2_rect vivid_max_rect; extern unsigned vivid_debug; /* * NULL-terminated string array for the HDMI 'Connected To' menu controls * with the list of possible HDMI outputs. * * The first two items are fixed ("TPG" and "None"). */ extern char *vivid_ctrl_hdmi_to_output_strings[1 + MAX_MENU_ITEMS]; /* Menu control skip mask of all HDMI outputs that are in use */ extern u64 hdmi_to_output_menu_skip_mask; /* * Bitmask of which vivid instances need to update any connected * HDMI outputs. */ extern u64 hdmi_input_update_outputs_mask; /* * Spinlock for access to hdmi_to_output_menu_skip_mask and * hdmi_input_update_outputs_mask. */ extern spinlock_t hdmi_output_skip_mask_lock; /* * Workqueue that updates the menu controls whenever the HDMI menu skip mask * changes. */ extern struct workqueue_struct *update_hdmi_ctrls_workqueue; /* * The HDMI menu control value (index in the menu list) maps to an HDMI * output that is part of the given vivid_dev instance and has the given * output index (as returned by VIDIOC_G_OUTPUT). * * NULL/0 if not available. */ extern struct vivid_dev *vivid_ctrl_hdmi_to_output_instance[MAX_MENU_ITEMS]; extern unsigned int vivid_ctrl_hdmi_to_output_index[MAX_MENU_ITEMS]; /* * NULL-terminated string array for the S-Video 'Connected To' menu controls * with the list of possible S-Video outputs. * * The first two items are fixed ("TPG" and "None"). */ extern char *vivid_ctrl_svid_to_output_strings[1 + MAX_MENU_ITEMS]; /* Menu control skip mask of all S-Video outputs that are in use */ extern u64 svid_to_output_menu_skip_mask; /* Spinlock for access to svid_to_output_menu_skip_mask */ extern spinlock_t svid_output_skip_mask_lock; /* * Workqueue that updates the menu controls whenever the S-Video menu skip mask * changes. */ extern struct workqueue_struct *update_svid_ctrls_workqueue; /* * The S-Video menu control value (index in the menu list) maps to an S-Video * output that is part of the given vivid_dev instance and has the given * output index (as returned by VIDIOC_G_OUTPUT). * * NULL/0 if not available. */ extern struct vivid_dev *vivid_ctrl_svid_to_output_instance[MAX_MENU_ITEMS]; extern unsigned int vivid_ctrl_svid_to_output_index[MAX_MENU_ITEMS]; extern struct vivid_dev *vivid_devs[VIVID_MAX_DEVS]; extern unsigned int n_devs; struct vivid_fmt { u32 fourcc; /* v4l2 format id */ enum tgp_color_enc color_enc; bool can_do_overlay; u8 vdownsampling[TPG_MAX_PLANES]; u32 alpha_mask; u8 planes; u8 buffers; u32 data_offset[TPG_MAX_PLANES]; u32 bit_depth[TPG_MAX_PLANES]; }; extern struct vivid_fmt vivid_formats[]; /* buffer for one video frame */ struct vivid_buffer { /* common v4l buffer stuff -- must be first */ struct vb2_v4l2_buffer vb; struct list_head list; }; enum vivid_input { WEBCAM, TV, SVID, HDMI, }; enum vivid_signal_mode { CURRENT_DV_TIMINGS, CURRENT_STD = CURRENT_DV_TIMINGS, NO_SIGNAL, NO_LOCK, OUT_OF_RANGE, SELECTED_DV_TIMINGS, SELECTED_STD = SELECTED_DV_TIMINGS, CYCLE_DV_TIMINGS, CYCLE_STD = CYCLE_DV_TIMINGS, CUSTOM_DV_TIMINGS, }; enum vivid_colorspace { VIVID_CS_170M, VIVID_CS_709, VIVID_CS_SRGB, VIVID_CS_OPRGB, VIVID_CS_2020, VIVID_CS_DCI_P3, VIVID_CS_240M, VIVID_CS_SYS_M, VIVID_CS_SYS_BG, }; #define VIVID_INVALID_SIGNAL(mode) \ ((mode) == NO_SIGNAL || (mode) == NO_LOCK || (mode) == OUT_OF_RANGE) struct vivid_cec_xfer { struct cec_adapter *adap; u8 msg[CEC_MAX_MSG_SIZE]; u32 len; u32 sft; }; struct vivid_dev { u8 inst; struct v4l2_device v4l2_dev; #ifdef CONFIG_MEDIA_CONTROLLER struct media_device mdev; struct media_pad vid_cap_pad; struct media_pad vid_out_pad; struct media_pad vbi_cap_pad; struct media_pad vbi_out_pad; struct media_pad sdr_cap_pad; struct media_pad meta_cap_pad; struct media_pad meta_out_pad; struct media_pad touch_cap_pad; #endif struct v4l2_ctrl_handler ctrl_hdl_user_gen; struct v4l2_ctrl_handler ctrl_hdl_user_vid; struct v4l2_ctrl_handler ctrl_hdl_user_aud; struct v4l2_ctrl_handler ctrl_hdl_streaming; struct v4l2_ctrl_handler ctrl_hdl_sdtv_cap; struct v4l2_ctrl_handler ctrl_hdl_loop_cap; struct v4l2_ctrl_handler ctrl_hdl_fb; struct video_device vid_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_vid_cap; struct video_device vid_out_dev; struct v4l2_ctrl_handler ctrl_hdl_vid_out; struct video_device vbi_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_vbi_cap; struct video_device vbi_out_dev; struct v4l2_ctrl_handler ctrl_hdl_vbi_out; struct video_device radio_rx_dev; struct v4l2_ctrl_handler ctrl_hdl_radio_rx; struct video_device radio_tx_dev; struct v4l2_ctrl_handler ctrl_hdl_radio_tx; struct video_device sdr_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_sdr_cap; struct video_device meta_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_meta_cap; struct video_device meta_out_dev; struct v4l2_ctrl_handler ctrl_hdl_meta_out; struct video_device touch_cap_dev; struct v4l2_ctrl_handler ctrl_hdl_touch_cap; spinlock_t slock; struct mutex mutex; struct work_struct update_hdmi_ctrl_work; struct work_struct update_svid_ctrl_work; /* capabilities */ u32 vid_cap_caps; u32 vid_out_caps; u32 vbi_cap_caps; u32 vbi_out_caps; u32 sdr_cap_caps; u32 radio_rx_caps; u32 radio_tx_caps; u32 meta_cap_caps; u32 meta_out_caps; u32 touch_cap_caps; /* supported features */ bool multiplanar; u8 num_inputs; u8 num_hdmi_inputs; u8 num_svid_inputs; u8 input_type[MAX_INPUTS]; u8 input_name_counter[MAX_INPUTS]; u8 num_outputs; u8 num_hdmi_outputs; u8 output_type[MAX_OUTPUTS]; u8 output_name_counter[MAX_OUTPUTS]; bool has_audio_inputs; bool has_audio_outputs; bool has_vid_cap; bool has_vid_out; bool has_vbi_cap; bool has_raw_vbi_cap; bool has_sliced_vbi_cap; bool has_vbi_out; bool has_raw_vbi_out; bool has_sliced_vbi_out; bool has_radio_rx; bool has_radio_tx; bool has_sdr_cap; bool has_fb; bool has_meta_cap; bool has_meta_out; bool has_tv_tuner; bool has_touch_cap; /* Output index (0-MAX_OUTPUTS) to vivid instance of connected input */ struct vivid_dev *output_to_input_instance[MAX_OUTPUTS]; /* Output index (0-MAX_OUTPUTS) to input index (0-MAX_INPUTS) of connected input */ u8 output_to_input_index[MAX_OUTPUTS]; /* Output index (0-MAX_OUTPUTS) to HDMI or S-Video output index (0-MAX_HDMI/SVID_OUTPUTS) */ u8 output_to_iface_index[MAX_OUTPUTS]; /* ctrl_hdmi_to_output or ctrl_svid_to_output control value for each input */ s32 input_is_connected_to_output[MAX_INPUTS]; /* HDMI index (0-MAX_HDMI_OUTPUTS) to output index (0-MAX_OUTPUTS) */ u8 hdmi_index_to_output_index[MAX_HDMI_OUTPUTS]; /* HDMI index (0-MAX_HDMI_INPUTS) to input index (0-MAX_INPUTS) */ u8 hdmi_index_to_input_index[MAX_HDMI_INPUTS]; /* S-Video index (0-MAX_SVID_INPUTS) to input index (0-MAX_INPUTS) */ u8 svid_index_to_input_index[MAX_SVID_INPUTS]; /* controls */ struct v4l2_ctrl *brightness; struct v4l2_ctrl *contrast; struct v4l2_ctrl *saturation; struct v4l2_ctrl *hue; struct { /* autogain/gain cluster */ struct v4l2_ctrl *autogain; struct v4l2_ctrl *gain; }; struct v4l2_ctrl *volume; struct v4l2_ctrl *mute; struct v4l2_ctrl *alpha; struct v4l2_ctrl *button; struct v4l2_ctrl *boolean; struct v4l2_ctrl *int32; struct v4l2_ctrl *int64; struct v4l2_ctrl *menu; struct v4l2_ctrl *string; struct v4l2_ctrl *bitmask; struct v4l2_ctrl *int_menu; struct v4l2_ctrl *ro_int32; struct v4l2_ctrl *pixel_array; struct v4l2_ctrl *test_pattern; struct v4l2_ctrl *colorspace; struct v4l2_ctrl *rgb_range_cap; struct v4l2_ctrl *real_rgb_range_cap; struct { /* std_signal_mode/standard cluster */ struct v4l2_ctrl *ctrl_std_signal_mode; struct v4l2_ctrl *ctrl_standard; }; struct { /* dv_timings_signal_mode/timings cluster */ struct v4l2_ctrl *ctrl_dv_timings_signal_mode; struct v4l2_ctrl *ctrl_dv_timings; }; struct v4l2_ctrl *ctrl_has_crop_cap; struct v4l2_ctrl *ctrl_has_compose_cap; struct v4l2_ctrl *ctrl_has_scaler_cap; struct v4l2_ctrl *ctrl_has_crop_out; struct v4l2_ctrl *ctrl_has_compose_out; struct v4l2_ctrl *ctrl_has_scaler_out; struct v4l2_ctrl *ctrl_tx_mode; struct v4l2_ctrl *ctrl_tx_rgb_range; struct v4l2_ctrl *ctrl_tx_edid_present; struct v4l2_ctrl *ctrl_tx_hotplug; struct v4l2_ctrl *ctrl_tx_rxsense; struct v4l2_ctrl *ctrl_rx_power_present; struct v4l2_ctrl *radio_tx_rds_pi; struct v4l2_ctrl *radio_tx_rds_pty; struct v4l2_ctrl *radio_tx_rds_mono_stereo; struct v4l2_ctrl *radio_tx_rds_art_head; struct v4l2_ctrl *radio_tx_rds_compressed; struct v4l2_ctrl *radio_tx_rds_dyn_pty; struct v4l2_ctrl *radio_tx_rds_ta; struct v4l2_ctrl *radio_tx_rds_tp; struct v4l2_ctrl *radio_tx_rds_ms; struct v4l2_ctrl *radio_tx_rds_psname; struct v4l2_ctrl *radio_tx_rds_radiotext; struct v4l2_ctrl *radio_rx_rds_pty; struct v4l2_ctrl *radio_rx_rds_ta; struct v4l2_ctrl *radio_rx_rds_tp; struct v4l2_ctrl *radio_rx_rds_ms; struct v4l2_ctrl *radio_rx_rds_psname; struct v4l2_ctrl *radio_rx_rds_radiotext; struct v4l2_ctrl *ctrl_hdmi_to_output[MAX_HDMI_INPUTS]; char ctrl_hdmi_to_output_names[MAX_HDMI_INPUTS][32]; struct v4l2_ctrl *ctrl_svid_to_output[MAX_SVID_INPUTS]; char ctrl_svid_to_output_names[MAX_SVID_INPUTS][32]; unsigned input_brightness[MAX_INPUTS]; unsigned osd_mode; unsigned button_pressed; bool sensor_hflip; bool sensor_vflip; bool hflip; bool vflip; bool vbi_cap_interlaced; bool loop_video; bool reduced_fps; /* Framebuffer */ unsigned long video_pbase; void *video_vbase; u32 video_buffer_size; int display_width; int display_height; int display_byte_stride; int bits_per_pixel; int bytes_per_pixel; #ifdef CONFIG_VIDEO_VIVID_OSD struct fb_info fb_info; struct fb_var_screeninfo fb_defined; struct fb_fix_screeninfo fb_fix; #endif /* Error injection */ bool disconnect_error; bool queue_setup_error; bool buf_prepare_error; bool start_streaming_error; bool dqbuf_error; bool req_validate_error; bool seq_wrap; u64 time_wrap; u64 time_wrap_offset; unsigned perc_dropped_buffers; enum vivid_signal_mode std_signal_mode[MAX_INPUTS]; unsigned int query_std_last[MAX_INPUTS]; v4l2_std_id query_std[MAX_INPUTS]; enum tpg_video_aspect std_aspect_ratio[MAX_INPUTS]; enum vivid_signal_mode dv_timings_signal_mode[MAX_INPUTS]; char **query_dv_timings_qmenu; char *query_dv_timings_qmenu_strings; unsigned query_dv_timings_size; unsigned int query_dv_timings_last[MAX_INPUTS]; unsigned int query_dv_timings[MAX_INPUTS]; enum tpg_video_aspect dv_timings_aspect_ratio[MAX_INPUTS]; /* Input */ unsigned input; v4l2_std_id std_cap[MAX_INPUTS]; struct v4l2_dv_timings dv_timings_cap[MAX_INPUTS]; int dv_timings_cap_sel[MAX_INPUTS]; u32 service_set_cap; struct vivid_vbi_gen_data vbi_gen; u8 *edid; unsigned edid_blocks; unsigned edid_max_blocks; unsigned webcam_size_idx; unsigned webcam_ival_idx; unsigned tv_freq; unsigned tv_audmode; unsigned tv_field_cap; unsigned tv_audio_input; u32 power_present; /* Output */ unsigned output; v4l2_std_id std_out; struct v4l2_dv_timings dv_timings_out; u32 colorspace_out; u32 ycbcr_enc_out; u32 hsv_enc_out; u32 quantization_out; u32 xfer_func_out; u32 service_set_out; unsigned bytesperline_out[TPG_MAX_PLANES]; unsigned tv_field_out; unsigned tv_audio_output; bool vbi_out_have_wss; u8 vbi_out_wss[2]; bool vbi_out_have_cc[2]; u8 vbi_out_cc[2][2]; bool dvi_d_out; u8 *scaled_line; u8 *blended_line; unsigned cur_scaled_line; /* Output Overlay */ void *fb_vbase_out; bool overlay_out_enabled; int overlay_out_top, overlay_out_left; unsigned fbuf_out_flags; u32 chromakey_out; u8 global_alpha_out; /* video capture */ struct tpg_data tpg; unsigned ms_vid_cap; bool must_blank[MAX_VID_CAP_BUFFERS]; const struct vivid_fmt *fmt_cap; struct v4l2_fract timeperframe_vid_cap; enum v4l2_field field_cap; struct v4l2_rect src_rect; struct v4l2_rect fmt_cap_rect; struct v4l2_rect crop_cap; struct v4l2_rect compose_cap; struct v4l2_rect crop_bounds_cap; struct vb2_queue vb_vid_cap_q; struct list_head vid_cap_active; struct vb2_queue vb_vbi_cap_q; struct list_head vbi_cap_active; struct vb2_queue vb_meta_cap_q; struct list_head meta_cap_active; struct vb2_queue vb_touch_cap_q; struct list_head touch_cap_active; /* thread for generating video capture stream */ struct task_struct *kthread_vid_cap; unsigned long jiffies_vid_cap; u64 cap_stream_start; u64 cap_frame_period; u64 cap_frame_eof_offset; u32 cap_seq_offset; u32 cap_seq_count; bool cap_seq_resync; u32 vid_cap_seq_start; u32 vid_cap_seq_count; bool vid_cap_streaming; u32 vbi_cap_seq_start; u32 vbi_cap_seq_count; bool vbi_cap_streaming; u32 meta_cap_seq_start; u32 meta_cap_seq_count; bool meta_cap_streaming; /* Touch capture */ struct task_struct *kthread_touch_cap; unsigned long jiffies_touch_cap; u64 touch_cap_stream_start; u32 touch_cap_seq_offset; bool touch_cap_seq_resync; u32 touch_cap_seq_start; u32 touch_cap_seq_count; u32 touch_cap_with_seq_wrap_count; bool touch_cap_streaming; struct v4l2_fract timeperframe_tch_cap; struct v4l2_pix_format tch_format; int tch_pat_random; /* video output */ const struct vivid_fmt *fmt_out; struct v4l2_fract timeperframe_vid_out; enum v4l2_field field_out; struct v4l2_rect sink_rect; struct v4l2_rect fmt_out_rect; struct v4l2_rect crop_out; struct v4l2_rect compose_out; struct v4l2_rect compose_bounds_out; struct vb2_queue vb_vid_out_q; struct list_head vid_out_active; struct vb2_queue vb_vbi_out_q; struct list_head vbi_out_active; struct vb2_queue vb_meta_out_q; struct list_head meta_out_active; /* video loop precalculated rectangles */ /* * Intersection between what the output side composes and the capture side * crops. I.e., what actually needs to be copied from the output buffer to * the capture buffer. */ struct v4l2_rect loop_vid_copy; /* The part of the output buffer that (after scaling) corresponds to loop_vid_copy. */ struct v4l2_rect loop_vid_out; /* The part of the capture buffer that (after scaling) corresponds to loop_vid_copy. */ struct v4l2_rect loop_vid_cap; /* * The intersection of the framebuffer, the overlay output window and * loop_vid_copy. I.e., the part of the framebuffer that actually should be * blended with the compose_out rectangle. This uses the framebuffer origin. */ struct v4l2_rect loop_fb_copy; /* The same as loop_fb_copy but with compose_out origin. */ struct v4l2_rect loop_vid_overlay; /* * The part of the capture buffer that (after scaling) corresponds * to loop_vid_overlay. */ struct v4l2_rect loop_vid_overlay_cap; /* thread for generating video output stream */ struct task_struct *kthread_vid_out; unsigned long jiffies_vid_out; u32 out_seq_offset; u32 out_seq_count; bool out_seq_resync; u32 vid_out_seq_start; u32 vid_out_seq_count; bool vid_out_streaming; u32 vbi_out_seq_start; u32 vbi_out_seq_count; bool vbi_out_streaming; bool stream_sliced_vbi_out; u32 meta_out_seq_start; u32 meta_out_seq_count; bool meta_out_streaming; /* SDR capture */ struct vb2_queue vb_sdr_cap_q; struct list_head sdr_cap_active; u32 sdr_pixelformat; /* v4l2 format id */ unsigned sdr_buffersize; unsigned sdr_adc_freq; unsigned sdr_fm_freq; unsigned sdr_fm_deviation; int sdr_fixp_src_phase; int sdr_fixp_mod_phase; bool tstamp_src_is_soe; bool has_crop_cap; bool has_compose_cap; bool has_scaler_cap; bool has_crop_out; bool has_compose_out; bool has_scaler_out; /* thread for generating SDR stream */ struct task_struct *kthread_sdr_cap; unsigned long jiffies_sdr_cap; u32 sdr_cap_seq_offset; u32 sdr_cap_seq_start; u32 sdr_cap_seq_count; u32 sdr_cap_with_seq_wrap_count; bool sdr_cap_seq_resync; /* RDS generator */ struct vivid_rds_gen rds_gen; /* Radio receiver */ unsigned radio_rx_freq; unsigned radio_rx_audmode; int radio_rx_sig_qual; unsigned radio_rx_hw_seek_mode; bool radio_rx_hw_seek_prog_lim; bool radio_rx_rds_controls; bool radio_rx_rds_enabled; unsigned radio_rx_rds_use_alternates; unsigned radio_rx_rds_last_block; struct v4l2_fh *radio_rx_rds_owner; /* Radio transmitter */ unsigned radio_tx_freq; unsigned radio_tx_subchans; bool radio_tx_rds_controls; unsigned radio_tx_rds_last_block; struct v4l2_fh *radio_tx_rds_owner; /* Shared between radio receiver and transmitter */ bool radio_rds_loop; ktime_t radio_rds_init_time; /* CEC */ struct cec_adapter *cec_rx_adap; struct cec_adapter *cec_tx_adap[MAX_HDMI_OUTPUTS]; struct task_struct *kthread_cec; wait_queue_head_t kthread_waitq_cec; struct vivid_cec_xfer xfers[MAX_OUTPUTS]; spinlock_t cec_xfers_slock; /* read and write cec messages */ u32 cec_sft; /* bus signal free time, in bit periods */ u8 last_initiator; /* CEC OSD String */ char osd[14]; unsigned long osd_jiffies; bool meta_pts; bool meta_scr; }; static inline bool vivid_is_webcam(const struct vivid_dev *dev) { return dev->input_type[dev->input] == WEBCAM; } static inline bool vivid_is_tv_cap(const struct vivid_dev *dev) { return dev->input_type[dev->input] == TV; } static inline bool vivid_is_svid_cap(const struct vivid_dev *dev) { return dev->input_type[dev->input] == SVID; } static inline bool vivid_is_hdmi_cap(const struct vivid_dev *dev) { return dev->input_type[dev->input] == HDMI; } static inline bool vivid_is_sdtv_cap(const struct vivid_dev *dev) { return vivid_is_tv_cap(dev) || vivid_is_svid_cap(dev); } static inline bool vivid_is_svid_out(const struct vivid_dev *dev) { return dev->output_type[dev->output] == SVID; } static inline bool vivid_is_hdmi_out(const struct vivid_dev *dev) { return dev->output_type[dev->output] == HDMI; } #endif
1 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM handshake #if !defined(_TRACE_HANDSHAKE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_HANDSHAKE_H #include <linux/net.h> #include <net/tls_prot.h> #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> #define TLS_RECORD_TYPE_LIST \ record_type(CHANGE_CIPHER_SPEC) \ record_type(ALERT) \ record_type(HANDSHAKE) \ record_type(DATA) \ record_type(HEARTBEAT) \ record_type(TLS12_CID) \ record_type_end(ACK) #undef record_type #undef record_type_end #define record_type(x) TRACE_DEFINE_ENUM(TLS_RECORD_TYPE_##x); #define record_type_end(x) TRACE_DEFINE_ENUM(TLS_RECORD_TYPE_##x); TLS_RECORD_TYPE_LIST #undef record_type #undef record_type_end #define record_type(x) { TLS_RECORD_TYPE_##x, #x }, #define record_type_end(x) { TLS_RECORD_TYPE_##x, #x } #define show_tls_content_type(type) \ __print_symbolic(type, TLS_RECORD_TYPE_LIST) TRACE_DEFINE_ENUM(TLS_ALERT_LEVEL_WARNING); TRACE_DEFINE_ENUM(TLS_ALERT_LEVEL_FATAL); #define show_tls_alert_level(level) \ __print_symbolic(level, \ { TLS_ALERT_LEVEL_WARNING, "Warning" }, \ { TLS_ALERT_LEVEL_FATAL, "Fatal" }) #define TLS_ALERT_DESCRIPTION_LIST \ alert_description(CLOSE_NOTIFY) \ alert_description(UNEXPECTED_MESSAGE) \ alert_description(BAD_RECORD_MAC) \ alert_description(RECORD_OVERFLOW) \ alert_description(HANDSHAKE_FAILURE) \ alert_description(BAD_CERTIFICATE) \ alert_description(UNSUPPORTED_CERTIFICATE) \ alert_description(CERTIFICATE_REVOKED) \ alert_description(CERTIFICATE_EXPIRED) \ alert_description(CERTIFICATE_UNKNOWN) \ alert_description(ILLEGAL_PARAMETER) \ alert_description(UNKNOWN_CA) \ alert_description(ACCESS_DENIED) \ alert_description(DECODE_ERROR) \ alert_description(DECRYPT_ERROR) \ alert_description(TOO_MANY_CIDS_REQUESTED) \ alert_description(PROTOCOL_VERSION) \ alert_description(INSUFFICIENT_SECURITY) \ alert_description(INTERNAL_ERROR) \ alert_description(INAPPROPRIATE_FALLBACK) \ alert_description(USER_CANCELED) \ alert_description(MISSING_EXTENSION) \ alert_description(UNSUPPORTED_EXTENSION) \ alert_description(UNRECOGNIZED_NAME) \ alert_description(BAD_CERTIFICATE_STATUS_RESPONSE) \ alert_description(UNKNOWN_PSK_IDENTITY) \ alert_description(CERTIFICATE_REQUIRED) \ alert_description_end(NO_APPLICATION_PROTOCOL) #undef alert_description #undef alert_description_end #define alert_description(x) TRACE_DEFINE_ENUM(TLS_ALERT_DESC_##x); #define alert_description_end(x) TRACE_DEFINE_ENUM(TLS_ALERT_DESC_##x); TLS_ALERT_DESCRIPTION_LIST #undef alert_description #undef alert_description_end #define alert_description(x) { TLS_ALERT_DESC_##x, #x }, #define alert_description_end(x) { TLS_ALERT_DESC_##x, #x } #define show_tls_alert_description(desc) \ __print_symbolic(desc, TLS_ALERT_DESCRIPTION_LIST) DECLARE_EVENT_CLASS(handshake_event_class, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk ), TP_ARGS(net, req, sk), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = sk; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p", __entry->req, __entry->sk ) ); #define DEFINE_HANDSHAKE_EVENT(name) \ DEFINE_EVENT(handshake_event_class, name, \ TP_PROTO( \ const struct net *net, \ const struct handshake_req *req, \ const struct sock *sk \ ), \ TP_ARGS(net, req, sk)) DECLARE_EVENT_CLASS(handshake_fd_class, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk, int fd ), TP_ARGS(net, req, sk, fd), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(int, fd) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = req->hr_sk; __entry->fd = fd; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p fd=%d", __entry->req, __entry->sk, __entry->fd ) ); #define DEFINE_HANDSHAKE_FD_EVENT(name) \ DEFINE_EVENT(handshake_fd_class, name, \ TP_PROTO( \ const struct net *net, \ const struct handshake_req *req, \ const struct sock *sk, \ int fd \ ), \ TP_ARGS(net, req, sk, fd)) DECLARE_EVENT_CLASS(handshake_error_class, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk, int err ), TP_ARGS(net, req, sk, err), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(int, err) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = sk; __entry->err = err; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p err=%d", __entry->req, __entry->sk, __entry->err ) ); #define DEFINE_HANDSHAKE_ERROR(name) \ DEFINE_EVENT(handshake_error_class, name, \ TP_PROTO( \ const struct net *net, \ const struct handshake_req *req, \ const struct sock *sk, \ int err \ ), \ TP_ARGS(net, req, sk, err)) DECLARE_EVENT_CLASS(handshake_alert_class, TP_PROTO( const struct sock *sk, unsigned char level, unsigned char description ), TP_ARGS(sk, level, description), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(unsigned int, netns_ino) __field(unsigned long, level) __field(unsigned long, description) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); __entry->netns_ino = sock_net(sk)->ns.inum; __entry->level = level; __entry->description = description; ), TP_printk("src=%pISpc dest=%pISpc %s: %s", __entry->saddr, __entry->daddr, show_tls_alert_level(__entry->level), show_tls_alert_description(__entry->description) ) ); #define DEFINE_HANDSHAKE_ALERT(name) \ DEFINE_EVENT(handshake_alert_class, name, \ TP_PROTO( \ const struct sock *sk, \ unsigned char level, \ unsigned char description \ ), \ TP_ARGS(sk, level, description)) /* * Request lifetime events */ DEFINE_HANDSHAKE_EVENT(handshake_submit); DEFINE_HANDSHAKE_ERROR(handshake_submit_err); DEFINE_HANDSHAKE_EVENT(handshake_cancel); DEFINE_HANDSHAKE_EVENT(handshake_cancel_none); DEFINE_HANDSHAKE_EVENT(handshake_cancel_busy); DEFINE_HANDSHAKE_EVENT(handshake_destruct); TRACE_EVENT(handshake_complete, TP_PROTO( const struct net *net, const struct handshake_req *req, const struct sock *sk, int status ), TP_ARGS(net, req, sk, status), TP_STRUCT__entry( __field(const void *, req) __field(const void *, sk) __field(int, status) __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->req = req; __entry->sk = sk; __entry->status = status; __entry->netns_ino = net->ns.inum; ), TP_printk("req=%p sk=%p status=%d", __entry->req, __entry->sk, __entry->status ) ); /* * Netlink events */ DEFINE_HANDSHAKE_ERROR(handshake_notify_err); DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_accept); DEFINE_HANDSHAKE_ERROR(handshake_cmd_accept_err); DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_done); DEFINE_HANDSHAKE_ERROR(handshake_cmd_done_err); /* * TLS Record events */ TRACE_EVENT(tls_contenttype, TP_PROTO( const struct sock *sk, unsigned char type ), TP_ARGS(sk, type), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(unsigned int, netns_ino) __field(unsigned long, type) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); __entry->netns_ino = sock_net(sk)->ns.inum; __entry->type = type; ), TP_printk("src=%pISpc dest=%pISpc %s", __entry->saddr, __entry->daddr, show_tls_content_type(__entry->type) ) ); /* * TLS Alert events */ DEFINE_HANDSHAKE_ALERT(tls_alert_send); DEFINE_HANDSHAKE_ALERT(tls_alert_recv); #endif /* _TRACE_HANDSHAKE_H */ #include <trace/define_trace.h>
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical algorithm * * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H void rcu_softirq_qs(void); void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(void); void rcu_cpu_stall_reset(void); void rcu_request_urgent_qs_task(struct task_struct *t); /* * Note a virtualization-based context switch. This is simply a * wrapper around rcu_note_context_switch(), which allows TINY_RCU * to save a few bytes. The caller must have disabled interrupts. */ static inline void rcu_virt_note_context_switch(void) { rcu_note_context_switch(false); } void synchronize_rcu_expedited(void); void rcu_barrier(void); void rcu_momentary_eqs(void); struct rcu_gp_oldstate { unsigned long rgos_norm; unsigned long rgos_exp; }; // Maximum number of rcu_gp_oldstate values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4 /** * same_state_synchronize_rcu_full - Are two old-state values identical? * @rgosp1: First old-state value. * @rgosp2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), * or get_completed_synchronize_rcu_full(). Returns @true if the two * values are identical and @false otherwise. This allows structures * whose lifetimes are tracked by old-state values to push these values * to a list header, allowing those structures to be slightly smaller. * * Note that equality is judged on a bitwise basis, so that an * @rcu_gp_oldstate structure with an already-completed state in one field * will compare not-equal to a structure with an already-completed state * in the other field. After all, the @rcu_gp_oldstate structure is opaque * so how did such a situation come to pass in the first place? */ static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2) { return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp; } unsigned long start_poll_synchronize_rcu_expedited(void); void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu_expedited(unsigned long oldstate); void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp); unsigned long get_state_synchronize_rcu(void); void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); unsigned long start_poll_synchronize_rcu(void); void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); bool poll_state_synchronize_rcu(unsigned long oldstate); bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); void cond_synchronize_rcu(unsigned long oldstate); void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); #ifdef CONFIG_PROVE_RCU void rcu_irq_exit_check_preempt(void); #else static inline void rcu_irq_exit_check_preempt(void) { } #endif struct task_struct; void rcu_preempt_deferred_qs(struct task_struct *t); void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); #ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); void rcutree_report_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int rcutree_dead_cpu(unsigned int cpu); int rcutree_dying_cpu(unsigned int cpu); int rcutree_offline_cpu(unsigned int cpu); #else #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL #define rcutree_offline_cpu NULL #endif void rcutree_migrate_callbacks(int cpu); /* Called from hotplug and also arm64 early secondary boot failure */ void rcutree_report_cpu_dead(void); #endif /* __LINUX_RCUTREE_H */
1213 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_STRINGHASH_H #define __LINUX_STRINGHASH_H #include <linux/compiler.h> /* For __pure */ #include <linux/types.h> /* For u32, u64 */ #include <linux/hash.h> /* * Routines for hashing strings of bytes to a 32-bit hash value. * * These hash functions are NOT GUARANTEED STABLE between kernel * versions, architectures, or even repeated boots of the same kernel. * (E.g. they may depend on boot-time hardware detection or be * deliberately randomized.) * * They are also not intended to be secure against collisions caused by * malicious inputs; much slower hash functions are required for that. * * They are optimized for pathname components, meaning short strings. * Even if a majority of files have longer names, the dynamic profile of * pathname components skews short due to short directory names. * (E.g. /usr/lib/libsesquipedalianism.so.3.141.) */ /* * Version 1: one byte at a time. Example of use: * * unsigned long hash = init_name_hash; * while (*p) * hash = partial_name_hash(tolower(*p++), hash); * hash = end_name_hash(hash); * * Although this is designed for bytes, fs/hfsplus/unicode.c * abuses it to hash 16-bit values. */ /* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ #define init_name_hash(salt) (unsigned long)(salt) /* partial hash update function. Assume roughly 4 bits per character */ static inline unsigned long partial_name_hash(unsigned long c, unsigned long prevhash) { return (prevhash + (c << 4) + (c >> 4)) * 11; } /* * Finally: cut down the number of bits to a int value (and try to avoid * losing bits). This also has the property (wanted by the dcache) * that the msbits make a good hash table index. */ static inline unsigned int end_name_hash(unsigned long hash) { return hash_long(hash, 32); } /* * Version 2: One word (32 or 64 bits) at a time. * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h> * exists, which describes major Linux platforms like x86 and ARM), then * this computes a different hash function much faster. * * If not set, this falls back to a wrapper around the preceding. */ extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int); /* * A hash_len is a u64 with the hash of a string in the low * half and the length in the high half. */ #define hashlen_hash(hashlen) ((u32)(hashlen)) #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) #define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) /* Return the "hash_len" (hash and length) of a null-terminated string */ extern u64 __pure hashlen_string(const void *salt, const char *name); #endif /* __LINUX_STRINGHASH_H */
58 58 58 59 57 59 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Resilient Queued Spin Lock * * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. * * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> */ #ifndef __ASM_GENERIC_RQSPINLOCK_H #define __ASM_GENERIC_RQSPINLOCK_H #include <linux/types.h> #include <vdso/time64.h> #include <linux/percpu.h> #ifdef CONFIG_QUEUED_SPINLOCKS #include <asm/qspinlock.h> #endif struct rqspinlock { union { atomic_t val; u32 locked; }; }; /* Even though this is same as struct rqspinlock, we need to emit a distinct * type in BTF for BPF programs. */ struct bpf_res_spin_lock { u32 val; }; struct qspinlock; #ifdef CONFIG_QUEUED_SPINLOCKS typedef struct qspinlock rqspinlock_t; #else typedef struct rqspinlock rqspinlock_t; #endif extern int resilient_tas_spin_lock(rqspinlock_t *lock); #ifdef CONFIG_QUEUED_SPINLOCKS extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val); #endif #ifndef resilient_virt_spin_lock_enabled static __always_inline bool resilient_virt_spin_lock_enabled(void) { return false; } #endif #ifndef resilient_virt_spin_lock static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock) { return 0; } #endif /* * Default timeout for waiting loops is 0.25 seconds */ #define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4) /* * Choose 31 as it makes rqspinlock_held cacheline-aligned. */ #define RES_NR_HELD 31 struct rqspinlock_held { int cnt; void *locks[RES_NR_HELD]; }; DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); static __always_inline void grab_held_lock_entry(void *lock) { int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt); if (unlikely(cnt > RES_NR_HELD)) { /* Still keep the inc so we decrement later. */ return; } /* * Implied compiler barrier in per-CPU operations; otherwise we can have * the compiler reorder inc with write to table, allowing interrupts to * overwrite and erase our write to the table (as on interrupt exit it * will be reset to NULL). * * It is fine for cnt inc to be reordered wrt remote readers though, * they won't observe our entry until the cnt update is visible, that's * all. */ this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock); } /* * We simply don't support out-of-order unlocks, and keep the logic simple here. * The verifier prevents BPF programs from unlocking out-of-order, and the same * holds for in-kernel users. * * It is possible to run into misdetection scenarios of AA deadlocks on the same * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct * logic to preserve right entries in the table would be to walk the array of * held locks and swap and clear out-of-order entries, but that's too * complicated and we don't have a compelling use case for out of order unlocking. */ static __always_inline void release_held_lock_entry(void) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); if (unlikely(rqh->cnt > RES_NR_HELD)) goto dec; WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); dec: /* * Reordering of clearing above with inc and its write in * grab_held_lock_entry that came before us (in same acquisition * attempt) is ok, we either see a valid entry or NULL when it's * visible. * * But this helper is invoked when we unwind upon failing to acquire the * lock. Unlike the unlock path which constitutes a release store after * we clear the entry, we need to emit a write barrier here. Otherwise, * we may have a situation as follows: * * <error> for lock B * release_held_lock_entry * * try_cmpxchg_acquire for lock A * grab_held_lock_entry * * Lack of any ordering means reordering may occur such that dec, inc * are done before entry is overwritten. This permits a remote lock * holder of lock B (which this CPU failed to acquire) to now observe it * as being attempted on this CPU, and may lead to misdetection (if this * CPU holds a lock it is attempting to acquire, leading to false ABBA * diagnosis). * * In case of unlock, we will always do a release on the lock word after * releasing the entry, ensuring that other CPUs cannot hold the lock * (and make conclusions about deadlocks) until the entry has been * cleared on the local CPU, preventing any anomalies. Reordering is * still possible there, but a remote CPU cannot observe a lock in our * table which it is already holding, since visibility entails our * release store for the said lock has not retired. * * In theory we don't have a problem if the dec and WRITE_ONCE above get * reordered with each other, we either notice an empty NULL entry on * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which * cannot be observed (if dec precedes WRITE_ONCE). * * Emit the write barrier _before_ the dec, this permits dec-inc * reordering but that is harmless as we'd have new entry set to NULL * already, i.e. they cannot precede the NULL store above. */ smp_wmb(); this_cpu_dec(rqspinlock_held_locks.cnt); } #ifdef CONFIG_QUEUED_SPINLOCKS /** * res_spin_lock - acquire a queued spinlock * @lock: Pointer to queued spinlock structure * * Return: * * 0 - Lock was acquired successfully. * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. * * -ETIMEDOUT - Lock acquisition failed because of timeout. */ static __always_inline int res_spin_lock(rqspinlock_t *lock) { int val = 0; if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { grab_held_lock_entry(lock); return 0; } return resilient_queued_spin_lock_slowpath(lock, val); } #else #define res_spin_lock(lock) resilient_tas_spin_lock(lock) #endif /* CONFIG_QUEUED_SPINLOCKS */ static __always_inline void res_spin_unlock(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); if (unlikely(rqh->cnt > RES_NR_HELD)) goto unlock; WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); unlock: /* * Release barrier, ensures correct ordering. See release_held_lock_entry * for details. Perform release store instead of queued_spin_unlock, * since we use this function for test-and-set fallback as well. When we * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. * * Like release_held_lock_entry, we can do the release before the dec. * We simply care about not seeing the 'lock' in our table from a remote * CPU once the lock has been released, which doesn't rely on the dec. * * Unlike smp_wmb(), release is not a two way fence, hence it is * possible for a inc to move up and reorder with our clearing of the * entry. This isn't a problem however, as for a misdiagnosis of ABBA, * the remote CPU needs to hold this lock, which won't be released until * the store below is done, which would ensure the entry is overwritten * to NULL, etc. */ smp_store_release(&lock->locked, 0); this_cpu_dec(rqspinlock_held_locks.cnt); } #ifdef CONFIG_QUEUED_SPINLOCKS #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; }) #else #define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; }) #endif #define raw_res_spin_lock(lock) \ ({ \ int __ret; \ preempt_disable(); \ __ret = res_spin_lock(lock); \ if (__ret) \ preempt_enable(); \ __ret; \ }) #define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); }) #define raw_res_spin_lock_irqsave(lock, flags) \ ({ \ int __ret; \ local_irq_save(flags); \ __ret = raw_res_spin_lock(lock); \ if (__ret) \ local_irq_restore(flags); \ __ret; \ }) #define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); }) #endif /* __ASM_GENERIC_RQSPINLOCK_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 // SPDX-License-Identifier: GPL-2.0-only /* * IEEE802154.4 socket interface * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> */ #include <linux/net.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/if.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <net/datalink.h> #include <net/psnap.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/route.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> /* Utility function for families */ static struct net_device* ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { struct net_device *dev = NULL; struct net_device *tmp; __le16 pan_id, short_addr; u8 hwaddr[IEEE802154_ADDR_LEN]; switch (addr->mode) { case IEEE802154_ADDR_LONG: ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) break; rtnl_lock(); for_each_netdev(net, tmp) { if (tmp->type != ARPHRD_IEEE802154) continue; pan_id = tmp->ieee802154_ptr->pan_id; short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; dev_hold(dev); break; } } rtnl_unlock(); break; default: pr_warn("Unsupported ieee802154 address type: %d\n", addr->mode); break; } return dev; } static int ieee802154_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { sock->sk = NULL; sk->sk_prot->close(sk, 0); } return 0; } static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; return sk->sk_prot->sendmsg(sk, msg, len); } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; if (sk->sk_prot->bind) return sk->sk_prot->bind(sk, uaddr, addr_len); return sock_no_bind(sock, uaddr, addr_len); } static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); return sk->sk_prot->connect(sk, uaddr, addr_len); } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, unsigned int cmd) { struct ifreq ifr; int ret = -ENOIOCTLCMD; struct net_device *dev; if (get_user_ifreq(&ifr, NULL, arg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; dev_load(sock_net(sk), ifr.ifr_name); dev = dev_get_by_name(sock_net(sk), ifr.ifr_name); if (!dev) return -ENODEV; if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); if (!ret && put_user_ifreq(&ifr, arg)) ret = -EFAULT; dev_put(dev); return ret; } static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCGIFADDR: case SIOCSIFADDR: return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, cmd); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; return sk_ioctl(sk, cmd, (void __user *)arg); } } /* RAW Sockets (802.15.4 created in userspace) */ static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); write_unlock_bh(&raw_lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } static void raw_unhash(struct sock *sk) { write_lock_bh(&raw_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&raw_lock); } static void raw_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) { struct ieee802154_addr addr; struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; int err = 0; struct net_device *dev = NULL; err = ieee802154_sockaddr_check_size(uaddr, len); if (err < 0) return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); ieee802154_addr_from_sa(&addr, &uaddr->addr); dev = ieee802154_get_dev(sock_net(sk), &addr); if (!dev) { err = -ENODEV; goto out; } sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); dev_put(dev); out: release_sock(sk); return err; } static int raw_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { return -ENOTSUPP; } static int raw_disconnect(struct sock *sk, int flags) { return 0; } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } lock_sock(sk); if (!sk->sk_bound_dev_if) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); release_sock(sk); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } if (!size) { err = 0; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_mac_header(skb); skb_reset_network_header(skb); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk; read_lock(&raw_lock); sk_for_each(sk, &raw_head) { bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) raw_rcv_skb(sk, clone); } bh_unlock_sock(sk); } read_unlock(&raw_lock); } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } static int raw_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { return -EOPNOTSUPP; } static struct proto ieee802154_raw_prot = { .name = "IEEE-802.15.4-RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), .close = raw_close, .bind = raw_bind, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .hash = raw_hash, .unhash = raw_unhash, .connect = raw_connect, .disconnect = raw_disconnect, .getsockopt = raw_getsockopt, .setsockopt = raw_setsockopt, }; static const struct proto_ops ieee802154_raw_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; /* DGRAM Sockets (802.15.4 dataframes) */ static HLIST_HEAD(dgram_head); static DEFINE_RWLOCK(dgram_lock); struct dgram_sock { struct sock sk; struct ieee802154_addr src_addr; struct ieee802154_addr dst_addr; unsigned int bound:1; unsigned int connected:1; unsigned int want_ack:1; unsigned int want_lqi:1; unsigned int secen:1; unsigned int secen_override:1; unsigned int seclevel:3; unsigned int seclevel_override:1; }; static inline struct dgram_sock *dgram_sk(const struct sock *sk) { return container_of(sk, struct dgram_sock, sk); } static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); write_unlock_bh(&dgram_lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } static void dgram_unhash(struct sock *sk) { write_lock_bh(&dgram_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&dgram_lock); } static int dgram_init(struct sock *sk) { struct dgram_sock *ro = dgram_sk(sk); ro->want_ack = 1; ro->want_lqi = 0; return 0; } static void dgram_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct ieee802154_addr haddr; struct dgram_sock *ro = dgram_sk(sk); int err = -EINVAL; struct net_device *dev; lock_sock(sk); ro->bound = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) goto out; if (addr->family != AF_IEEE802154) { err = -EINVAL; goto out; } ieee802154_addr_from_sa(&haddr, &addr->addr); dev = ieee802154_get_dev(sock_net(sk), &haddr); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_IEEE802154) { err = -ENODEV; goto out_put; } ro->src_addr = haddr; ro->bound = 1; err = 0; out_put: dev_put(dev); out: release_sock(sk); return err; } static int dgram_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; *karg = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { /* We will only return the amount * of this packet since that is all * that will be read. */ *karg = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } } return -ENOIOCTLCMD; } /* FIXME: autobind */ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct dgram_sock *ro = dgram_sk(sk); int err = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) return err; if (addr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); if (!ro->bound) { err = -ENETUNREACH; goto out; } ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr); ro->connected = 1; out: release_sock(sk); return err; } static int dgram_disconnect(struct sock *sk, int flags) { struct dgram_sock *ro = dgram_sk(sk); lock_sock(sk); ro->connected = 0; release_sock(sk); return 0; } static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } if (msg->msg_name) { if (ro->connected) return -EISCONN; if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) return -EINVAL; err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); if (err < 0) return err; ieee802154_addr_from_sa(&dst_addr, &daddr->addr); } else { if (!ro->connected) return -EDESTADDRREQ; dst_addr = ro->dst_addr; } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_network_header(skb); cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; err = wpan_dev_hard_header(skb, dev, &dst_addr, ro->bound ? &ro->src_addr : NULL, size); if (err < 0) goto out_skb; err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; struct dgram_sock *ro = dgram_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* FIXME: skip headers if necessary ?! */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_cmsgs(msg, sk, skb); if (saddr) { /* Clear the implicit padding in struct sockaddr_ieee802154 * (16 bits between 'family' and 'addr') and in struct * ieee802154_addr_sa (16 bits at the end of the structure). */ memset(saddr, 0, sizeof(*saddr)); saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); } if (ro->want_lqi) { err = put_cmsg(msg, SOL_IEEE802154, WPAN_WANTLQI, sizeof(uint8_t), &(mac_cb(skb)->lqi)); if (err) goto done; } if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static inline bool ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr, struct dgram_sock *ro) { if (!ro->bound) return true; if (ro->src_addr.mode == IEEE802154_ADDR_LONG && hw_addr == ro->src_addr.extended_addr) return true; if (ro->src_addr.mode == IEEE802154_ADDR_SHORT && pan_id == ro->src_addr.pan_id && short_addr == ro->src_addr.short_addr) return true; return false; } static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk, *prev = NULL; int ret = NET_RX_SUCCESS; __le16 pan_id, short_addr; __le64 hw_addr; /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); pan_id = dev->ieee802154_ptr->pan_id; short_addr = dev->ieee802154_ptr->short_addr; hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { if (ieee802154_match_sock(hw_addr, pan_id, short_addr, dgram_sk(sk))) { if (prev) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) dgram_rcv_skb(prev, clone); } prev = sk; } } if (prev) { dgram_rcv_skb(prev, skb); } else { kfree_skb(skb); ret = NET_RX_DROP; } read_unlock(&dgram_lock); return ret; } static int dgram_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dgram_sock *ro = dgram_sk(sk); int val, len; if (level != SOL_IEEE802154) return -EOPNOTSUPP; if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case WPAN_WANTACK: val = ro->want_ack; break; case WPAN_WANTLQI: val = ro->want_lqi; break; case WPAN_SECURITY: if (!ro->secen_override) val = WPAN_SECURITY_DEFAULT; else if (ro->secen) val = WPAN_SECURITY_ON; else val = WPAN_SECURITY_OFF; break; case WPAN_SECURITY_LEVEL: if (!ro->seclevel_override) val = WPAN_SECURITY_LEVEL_DEFAULT; else val = ro->seclevel; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int dgram_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); int val; int err = 0; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); switch (optname) { case WPAN_WANTACK: ro->want_ack = !!val; break; case WPAN_WANTLQI: ro->want_lqi = !!val; break; case WPAN_SECURITY: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } switch (val) { case WPAN_SECURITY_DEFAULT: ro->secen_override = 0; break; case WPAN_SECURITY_ON: ro->secen_override = 1; ro->secen = 1; break; case WPAN_SECURITY_OFF: ro->secen_override = 1; ro->secen = 0; break; default: err = -EINVAL; break; } break; case WPAN_SECURITY_LEVEL: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } if (val < WPAN_SECURITY_LEVEL_DEFAULT || val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) { err = -EINVAL; } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) { ro->seclevel_override = 0; } else { ro->seclevel_override = 1; ro->seclevel = val; } break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static struct proto ieee802154_dgram_prot = { .name = "IEEE-802.15.4-MAC", .owner = THIS_MODULE, .obj_size = sizeof(struct dgram_sock), .init = dgram_init, .close = dgram_close, .bind = dgram_bind, .sendmsg = dgram_sendmsg, .recvmsg = dgram_recvmsg, .hash = dgram_hash, .unhash = dgram_unhash, .connect = dgram_connect, .disconnect = dgram_disconnect, .ioctl = dgram_ioctl, .getsockopt = dgram_getsockopt, .setsockopt = dgram_setsockopt, }; static const struct proto_ops ieee802154_dgram_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static void ieee802154_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } /* Create a socket. Initialise the socket, blank the addresses * set the state. */ static int ieee802154_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc; struct proto *proto; const struct proto_ops *ops; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; switch (sock->type) { case SOCK_RAW: rc = -EPERM; if (!capable(CAP_NET_RAW)) goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; case SOCK_DGRAM: proto = &ieee802154_dgram_prot; ops = &ieee802154_dgram_ops; break; default: rc = -ESOCKTNOSUPPORT; goto out; } rc = -ENOMEM; sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; sock->ops = ops; sock_init_data(sock, sk); sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); if (sk->sk_prot->hash) { rc = sk->sk_prot->hash(sk); if (rc) goto out_sk_release; } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); if (rc) goto out_sk_release; } out: return rc; out_sk_release: sk_common_release(sk); sock->sk = NULL; goto out; } static const struct net_proto_family ieee802154_family_ops = { .family = PF_IEEE802154, .create = ieee802154_create, .owner = THIS_MODULE, }; static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (!netif_running(dev)) goto drop; pr_debug("got frame, type %d, dev %p\n", dev->type, dev); #ifdef DEBUG print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); #endif if (!net_eq(dev_net(dev), &init_net)) goto drop; ieee802154_raw_deliver(dev, skb); if (dev->type != ARPHRD_IEEE802154) goto drop; if (skb->pkt_type != PACKET_OTHERHOST) return ieee802154_dgram_deliver(dev, skb); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type ieee802154_packet_type = { .type = htons(ETH_P_IEEE802154), .func = ieee802154_rcv, }; static int __init af_ieee802154_init(void) { int rc; rc = proto_register(&ieee802154_raw_prot, 1); if (rc) goto out; rc = proto_register(&ieee802154_dgram_prot, 1); if (rc) goto err_dgram; /* Tell SOCKET that we are alive */ rc = sock_register(&ieee802154_family_ops); if (rc) goto err_sock; dev_add_pack(&ieee802154_packet_type); rc = 0; goto out; err_sock: proto_unregister(&ieee802154_dgram_prot); err_dgram: proto_unregister(&ieee802154_raw_prot); out: return rc; } static void __exit af_ieee802154_remove(void) { dev_remove_pack(&ieee802154_packet_type); sock_unregister(PF_IEEE802154); proto_unregister(&ieee802154_dgram_prot); proto_unregister(&ieee802154_raw_prot); } module_init(af_ieee802154_init); module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IEEE 802.15.4 socket interface"); MODULE_ALIAS_NETPROTO(PF_IEEE802154);
9 9 9 9 8 9 7 7 5 1 4 1 3 1 2 1 1 1 2 2 1 1 1 7 6 6 6 2 4 1 3 2 1 1 6 4 6 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 // SPDX-License-Identifier: GPL-2.0 /* * Documentation/ABI/stable/sysfs-fs-orangefs: * * What: /sys/fs/orangefs/perf_counter_reset * Date: June 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * echo a 0 or a 1 into perf_counter_reset to * reset all the counters in * /sys/fs/orangefs/perf_counters * except ones with PINT_PERF_PRESERVE set. * * * What: /sys/fs/orangefs/perf_counters/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Counters and settings for various caches. * Read only. * * * What: /sys/fs/orangefs/perf_time_interval_secs * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Length of perf counter intervals in * seconds. * * * What: /sys/fs/orangefs/perf_history_size * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * The perf_counters cache statistics have N, or * perf_history_size, samples. The default is * one. * * Every perf_time_interval_secs the (first) * samples are reset. * * If N is greater than one, the "current" set * of samples is reset, and the samples from the * other N-1 intervals remain available. * * * What: /sys/fs/orangefs/op_timeout_secs * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Service operation timeout in seconds. * * * What: /sys/fs/orangefs/slot_timeout_secs * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * "Slot" timeout in seconds. A "slot" * is an indexed buffer in the shared * memory segment used for communication * between the kernel module and userspace. * Slots are requested and waited for, * the wait times out after slot_timeout_secs. * * What: /sys/fs/orangefs/cache_timeout_msecs * Date: Mar 2018 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Time in milliseconds between which * orangefs_revalidate_mapping will invalidate the page * cache. * * What: /sys/fs/orangefs/dcache_timeout_msecs * Date: Jul 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Time lookup is valid in milliseconds. * * What: /sys/fs/orangefs/getattr_timeout_msecs * Date: Jul 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Time getattr is valid in milliseconds. * * What: /sys/fs/orangefs/readahead_count * Date: Aug 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Readahead cache buffer count. * * What: /sys/fs/orangefs/readahead_size * Date: Aug 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Readahead cache buffer size. * * What: /sys/fs/orangefs/readahead_count_size * Date: Aug 2016 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Readahead cache buffer count and size. * * What: /sys/fs/orangefs/readahead_readcnt * Date: Jan 2017 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Number of buffers (in multiples of readahead_size) * which can be read ahead for a single file at once. * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Attribute cache configurable settings. * * * What: /sys/fs/orangefs/ncache/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Name cache configurable settings. * * * What: /sys/fs/orangefs/capcache/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Capability cache configurable settings. * * * What: /sys/fs/orangefs/ccache/... * Date: Jun 2015 * Contact: Mike Marshall <hubcap@omnibond.com> * Description: * Credential cache configurable settings. * */ #include <linux/fs.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/module.h> #include <linux/init.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-sysfs.h" #define ORANGEFS_KOBJ_ID "orangefs" #define ACACHE_KOBJ_ID "acache" #define CAPCACHE_KOBJ_ID "capcache" #define CCACHE_KOBJ_ID "ccache" #define NCACHE_KOBJ_ID "ncache" #define PC_KOBJ_ID "pc" #define STATS_KOBJ_ID "stats" /* * Every item calls orangefs_attr_show and orangefs_attr_store through * orangefs_sysfs_ops. They look at the orangefs_attributes further below to * call one of sysfs_int_show, sysfs_int_store, sysfs_service_op_show, or * sysfs_service_op_store. */ struct orangefs_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct orangefs_attribute *attr, char *buf); ssize_t (*store)(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count); }; static ssize_t orangefs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct orangefs_attribute *attribute; attribute = container_of(attr, struct orangefs_attribute, attr); if (!attribute->show) return -EIO; return attribute->show(kobj, attribute, buf); } static ssize_t orangefs_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct orangefs_attribute *attribute; if (!strcmp(kobj->name, PC_KOBJ_ID) || !strcmp(kobj->name, STATS_KOBJ_ID)) return -EPERM; attribute = container_of(attr, struct orangefs_attribute, attr); if (!attribute->store) return -EIO; return attribute->store(kobj, attribute, buf, len); } static const struct sysfs_ops orangefs_sysfs_ops = { .show = orangefs_attr_show, .store = orangefs_attr_store, }; static ssize_t sysfs_int_show(struct kobject *kobj, struct orangefs_attribute *attr, char *buf) { int rc = -EIO; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj->name); if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "op_timeout_secs")) { rc = sysfs_emit(buf, "%d\n", op_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = sysfs_emit(buf, "%d\n", slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { rc = sysfs_emit(buf, "%d\n", orangefs_cache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = sysfs_emit(buf, "%d\n", orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { rc = sysfs_emit(buf, "%d\n", orangefs_getattr_timeout_msecs); goto out; } else { goto out; } } else if (!strcmp(kobj->name, STATS_KOBJ_ID)) { if (!strcmp(attr->attr.name, "reads")) { rc = sysfs_emit(buf, "%lu\n", orangefs_stats.reads); goto out; } else if (!strcmp(attr->attr.name, "writes")) { rc = sysfs_emit(buf, "%lu\n", orangefs_stats.writes); goto out; } else { goto out; } } out: return rc; } static ssize_t sysfs_int_store(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count) { int rc = 0; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_store: start attr->attr.name:%s: buf:%s:\n", attr->attr.name, buf); if (!strcmp(attr->attr.name, "op_timeout_secs")) { rc = kstrtoint(buf, 0, &op_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); goto out; } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_getattr_timeout_msecs); goto out; } else { goto out; } out: if (rc) rc = -EINVAL; else rc = count; return rc; } /* * obtain attribute values from userspace with a service operation. */ static ssize_t sysfs_service_op_show(struct kobject *kobj, struct orangefs_attribute *attr, char *buf) { struct orangefs_kernel_op_s *new_op = NULL; int rc = 0; char *ser_op_type = NULL; __u32 op_alloc_type; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_show: id:%s:\n", kobj->name); if (strcmp(kobj->name, PC_KOBJ_ID)) op_alloc_type = ORANGEFS_VFS_OP_PARAM; else op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT; new_op = op_alloc(op_alloc_type); if (!new_op) return -ENOMEM; /* Can't do a service_operation if the client is not running... */ rc = is_daemon_in_service(); if (rc) { pr_info_ratelimited("%s: Client not running :%d:\n", __func__, is_daemon_in_service()); goto out; } if (strcmp(kobj->name, PC_KOBJ_ID)) new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET; if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { /* Drop unsupported requests first. */ if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && (!strcmp(attr->attr.name, "readahead_count") || !strcmp(attr->attr.name, "readahead_size") || !strcmp(attr->attr.name, "readahead_count_size") || !strcmp(attr->attr.name, "readahead_readcnt"))) { rc = -EINVAL; goto out; } if (!strcmp(attr->attr.name, "perf_history_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; else if (!strcmp(attr->attr.name, "perf_counter_reset")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; else if (!strcmp(attr->attr.name, "readahead_count")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; else if (!strcmp(attr->attr.name, "readahead_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; else if (!strcmp(attr->attr.name, "readahead_count_size")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; else if (!strcmp(attr->attr.name, "readahead_readcnt")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT; } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_secs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "timeout_msecs")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; if (!strcmp(attr->attr.name, "hard_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; if (!strcmp(attr->attr.name, "soft_limit")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; if (!strcmp(attr->attr.name, "reclaim_percentage")) new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; } else if (!strcmp(kobj->name, PC_KOBJ_ID)) { if (!strcmp(attr->attr.name, ACACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_ACACHE; if (!strcmp(attr->attr.name, CAPCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE; if (!strcmp(attr->attr.name, NCACHE_KOBJ_ID)) new_op->upcall.req.perf_count.type = ORANGEFS_PERF_COUNT_REQUEST_NCACHE; } else { gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n", kobj->name); rc = -EINVAL; goto out; } if (strcmp(kobj->name, PC_KOBJ_ID)) ser_op_type = "orangefs_param"; else ser_op_type = "orangefs_perf_count"; /* * The service_operation will return an errno return code on * error, and zero on success. */ rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE); out: if (!rc) { if (strcmp(kobj->name, PC_KOBJ_ID)) { if (new_op->upcall.req.param.op == ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) { rc = sysfs_emit(buf, "%d %d\n", (int)new_op->downcall.resp.param.u. value32[0], (int)new_op->downcall.resp.param.u. value32[1]); } else { rc = sysfs_emit(buf, "%d\n", (int)new_op->downcall.resp.param.u.value64); } } else { rc = sysfs_emit( buf, "%s", new_op->downcall.resp.perf_count.buffer); } } op_release(new_op); return rc; } /* * pass attribute values back to userspace with a service operation. * * We have to do a memory allocation, an sscanf and a service operation. * And we have to evaluate what the user entered, to make sure the * value is within the range supported by the attribute. So, there's * a lot of return code checking and mapping going on here. * * We want to return 1 if we think everything went OK, and * EINVAL if not. */ static ssize_t sysfs_service_op_store(struct kobject *kobj, struct orangefs_attribute *attr, const char *buf, size_t count) { struct orangefs_kernel_op_s *new_op = NULL; int val = 0; int rc = 0; gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_service_op_store: id:%s:\n", kobj->name); new_op = op_alloc(ORANGEFS_VFS_OP_PARAM); if (!new_op) return -EINVAL; /* sic */ /* Can't do a service_operation if the client is not running... */ rc = is_daemon_in_service(); if (rc) { pr_info("%s: Client not running :%d:\n", __func__, is_daemon_in_service()); goto out; } /* * The value we want to send back to userspace is in buf, unless this * there are two parameters, which is specially handled below. */ if (strcmp(kobj->name, ORANGEFS_KOBJ_ID) || strcmp(attr->attr.name, "readahead_count_size")) { rc = kstrtoint(buf, 0, &val); if (rc) goto out; } new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET; if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) { /* Drop unsupported requests first. */ if (!(orangefs_features & ORANGEFS_FEATURE_READAHEAD) && (!strcmp(attr->attr.name, "readahead_count") || !strcmp(attr->attr.name, "readahead_size") || !strcmp(attr->attr.name, "readahead_count_size") || !strcmp(attr->attr.name, "readahead_readcnt"))) { rc = -EINVAL; goto out; } if (!strcmp(attr->attr.name, "perf_history_size")) { if (val > 0) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "perf_time_interval_secs")) { if (val > 0) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "perf_counter_reset")) { if ((val == 0) || (val == 1)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_PERF_RESET; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "readahead_count")) { if ((val >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "readahead_size")) { if ((val >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "readahead_count_size")) { int val1, val2; rc = sscanf(buf, "%d %d", &val1, &val2); if (rc < 2) { rc = 0; goto out; } if ((val1 >= 0) && (val2 >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; } else { rc = 0; goto out; } new_op->upcall.req.param.u.value32[0] = val1; new_op->upcall.req.param.u.value32[1] = val2; goto value_set; } else if (!strcmp(attr->attr.name, "readahead_readcnt")) { if ((val >= 0)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_READAHEAD_READCNT; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, CAPCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, CCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_secs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS; } else { rc = 0; goto out; } } } else if (!strcmp(kobj->name, NCACHE_KOBJ_ID)) { if (!strcmp(attr->attr.name, "hard_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "soft_limit")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "reclaim_percentage")) { if ((val > -1) && (val < 101)) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE; } else { rc = 0; goto out; } } else if (!strcmp(attr->attr.name, "timeout_msecs")) { if (val > -1) { new_op->upcall.req.param.op = ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS; } else { rc = 0; goto out; } } } else { gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n", kobj->name); rc = -EINVAL; goto out; } new_op->upcall.req.param.u.value64 = val; value_set: /* * The service_operation will return a errno return code on * error, and zero on success. */ rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE); if (rc < 0) { gossip_err("sysfs_service_op_store: service op returned:%d:\n", rc); rc = 0; } else { rc = count; } out: op_release(new_op); if (rc == -ENOMEM || rc == 0) rc = -EINVAL; return rc; } static struct orangefs_attribute op_timeout_secs_attribute = __ATTR(op_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute slot_timeout_secs_attribute = __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute cache_timeout_msecs_attribute = __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute dcache_timeout_msecs_attribute = __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute getattr_timeout_msecs_attribute = __ATTR(getattr_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); static struct orangefs_attribute readahead_count_attribute = __ATTR(readahead_count, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute readahead_size_attribute = __ATTR(readahead_size, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute readahead_count_size_attribute = __ATTR(readahead_count_size, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute readahead_readcnt_attribute = __ATTR(readahead_readcnt, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute perf_counter_reset_attribute = __ATTR(perf_counter_reset, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute perf_history_size_attribute = __ATTR(perf_history_size, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute perf_time_interval_secs_attribute = __ATTR(perf_time_interval_secs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, &cache_timeout_msecs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, &readahead_count_attribute.attr, &readahead_size_attribute.attr, &readahead_count_size_attribute.attr, &readahead_readcnt_attribute.attr, &perf_counter_reset_attribute.attr, &perf_history_size_attribute.attr, &perf_time_interval_secs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(orangefs_default); static struct kobject *orangefs_obj; static void orangefs_obj_release(struct kobject *kobj) { kfree(orangefs_obj); orangefs_obj = NULL; } static const struct kobj_type orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = orangefs_default_groups, .release = orangefs_obj_release, }; static struct orangefs_attribute acache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute acache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute acache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute acache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *acache_orangefs_default_attrs[] = { &acache_hard_limit_attribute.attr, &acache_reclaim_percent_attribute.attr, &acache_soft_limit_attribute.attr, &acache_timeout_msecs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(acache_orangefs_default); static struct kobject *acache_orangefs_obj; static void acache_orangefs_obj_release(struct kobject *kobj) { kfree(acache_orangefs_obj); acache_orangefs_obj = NULL; } static const struct kobj_type acache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = acache_orangefs_default_groups, .release = acache_orangefs_obj_release, }; static struct orangefs_attribute capcache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute capcache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute capcache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute capcache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *capcache_orangefs_default_attrs[] = { &capcache_hard_limit_attribute.attr, &capcache_reclaim_percent_attribute.attr, &capcache_soft_limit_attribute.attr, &capcache_timeout_secs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(capcache_orangefs_default); static struct kobject *capcache_orangefs_obj; static void capcache_orangefs_obj_release(struct kobject *kobj) { kfree(capcache_orangefs_obj); capcache_orangefs_obj = NULL; } static const struct kobj_type capcache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = capcache_orangefs_default_groups, .release = capcache_orangefs_obj_release, }; static struct orangefs_attribute ccache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ccache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ccache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ccache_timeout_secs_attribute = __ATTR(timeout_secs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *ccache_orangefs_default_attrs[] = { &ccache_hard_limit_attribute.attr, &ccache_reclaim_percent_attribute.attr, &ccache_soft_limit_attribute.attr, &ccache_timeout_secs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(ccache_orangefs_default); static struct kobject *ccache_orangefs_obj; static void ccache_orangefs_obj_release(struct kobject *kobj) { kfree(ccache_orangefs_obj); ccache_orangefs_obj = NULL; } static const struct kobj_type ccache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ccache_orangefs_default_groups, .release = ccache_orangefs_obj_release, }; static struct orangefs_attribute ncache_hard_limit_attribute = __ATTR(hard_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ncache_reclaim_percent_attribute = __ATTR(reclaim_percentage, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ncache_soft_limit_attribute = __ATTR(soft_limit, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct orangefs_attribute ncache_timeout_msecs_attribute = __ATTR(timeout_msecs, 0664, sysfs_service_op_show, sysfs_service_op_store); static struct attribute *ncache_orangefs_default_attrs[] = { &ncache_hard_limit_attribute.attr, &ncache_reclaim_percent_attribute.attr, &ncache_soft_limit_attribute.attr, &ncache_timeout_msecs_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(ncache_orangefs_default); static struct kobject *ncache_orangefs_obj; static void ncache_orangefs_obj_release(struct kobject *kobj) { kfree(ncache_orangefs_obj); ncache_orangefs_obj = NULL; } static const struct kobj_type ncache_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = ncache_orangefs_default_groups, .release = ncache_orangefs_obj_release, }; static struct orangefs_attribute pc_acache_attribute = __ATTR(acache, 0664, sysfs_service_op_show, NULL); static struct orangefs_attribute pc_capcache_attribute = __ATTR(capcache, 0664, sysfs_service_op_show, NULL); static struct orangefs_attribute pc_ncache_attribute = __ATTR(ncache, 0664, sysfs_service_op_show, NULL); static struct attribute *pc_orangefs_default_attrs[] = { &pc_acache_attribute.attr, &pc_capcache_attribute.attr, &pc_ncache_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(pc_orangefs_default); static struct kobject *pc_orangefs_obj; static void pc_orangefs_obj_release(struct kobject *kobj) { kfree(pc_orangefs_obj); pc_orangefs_obj = NULL; } static const struct kobj_type pc_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = pc_orangefs_default_groups, .release = pc_orangefs_obj_release, }; static struct orangefs_attribute stats_reads_attribute = __ATTR(reads, 0664, sysfs_int_show, NULL); static struct orangefs_attribute stats_writes_attribute = __ATTR(writes, 0664, sysfs_int_show, NULL); static struct attribute *stats_orangefs_default_attrs[] = { &stats_reads_attribute.attr, &stats_writes_attribute.attr, NULL, }; ATTRIBUTE_GROUPS(stats_orangefs_default); static struct kobject *stats_orangefs_obj; static void stats_orangefs_obj_release(struct kobject *kobj) { kfree(stats_orangefs_obj); stats_orangefs_obj = NULL; } static const struct kobj_type stats_orangefs_ktype = { .sysfs_ops = &orangefs_sysfs_ops, .default_groups = stats_orangefs_default_groups, .release = stats_orangefs_obj_release, }; int orangefs_sysfs_init(void) { int rc = -EINVAL; gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n"); /* create /sys/fs/orangefs. */ orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL); if (!orangefs_obj) goto out; rc = kobject_init_and_add(orangefs_obj, &orangefs_ktype, fs_kobj, ORANGEFS_KOBJ_ID); if (rc) goto ofs_obj_bail; kobject_uevent(orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/acache. */ acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL); if (!acache_orangefs_obj) { rc = -EINVAL; goto ofs_obj_bail; } rc = kobject_init_and_add(acache_orangefs_obj, &acache_orangefs_ktype, orangefs_obj, ACACHE_KOBJ_ID); if (rc) goto acache_obj_bail; kobject_uevent(acache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/capcache. */ capcache_orangefs_obj = kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL); if (!capcache_orangefs_obj) { rc = -EINVAL; goto acache_obj_bail; } rc = kobject_init_and_add(capcache_orangefs_obj, &capcache_orangefs_ktype, orangefs_obj, CAPCACHE_KOBJ_ID); if (rc) goto capcache_obj_bail; kobject_uevent(capcache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ccache. */ ccache_orangefs_obj = kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL); if (!ccache_orangefs_obj) { rc = -EINVAL; goto capcache_obj_bail; } rc = kobject_init_and_add(ccache_orangefs_obj, &ccache_orangefs_ktype, orangefs_obj, CCACHE_KOBJ_ID); if (rc) goto ccache_obj_bail; kobject_uevent(ccache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/ncache. */ ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL); if (!ncache_orangefs_obj) { rc = -EINVAL; goto ccache_obj_bail; } rc = kobject_init_and_add(ncache_orangefs_obj, &ncache_orangefs_ktype, orangefs_obj, NCACHE_KOBJ_ID); if (rc) goto ncache_obj_bail; kobject_uevent(ncache_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/perf_counters. */ pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL); if (!pc_orangefs_obj) { rc = -EINVAL; goto ncache_obj_bail; } rc = kobject_init_and_add(pc_orangefs_obj, &pc_orangefs_ktype, orangefs_obj, "perf_counters"); if (rc) goto pc_obj_bail; kobject_uevent(pc_orangefs_obj, KOBJ_ADD); /* create /sys/fs/orangefs/stats. */ stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL); if (!stats_orangefs_obj) { rc = -EINVAL; goto pc_obj_bail; } rc = kobject_init_and_add(stats_orangefs_obj, &stats_orangefs_ktype, orangefs_obj, STATS_KOBJ_ID); if (rc) goto stats_obj_bail; kobject_uevent(stats_orangefs_obj, KOBJ_ADD); goto out; stats_obj_bail: kobject_put(stats_orangefs_obj); pc_obj_bail: kobject_put(pc_orangefs_obj); ncache_obj_bail: kobject_put(ncache_orangefs_obj); ccache_obj_bail: kobject_put(ccache_orangefs_obj); capcache_obj_bail: kobject_put(capcache_orangefs_obj); acache_obj_bail: kobject_put(acache_orangefs_obj); ofs_obj_bail: kobject_put(orangefs_obj); out: return rc; } void orangefs_sysfs_exit(void) { gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n"); kobject_put(acache_orangefs_obj); kobject_put(capcache_orangefs_obj); kobject_put(ccache_orangefs_obj); kobject_put(ncache_orangefs_obj); kobject_put(pc_orangefs_obj); kobject_put(stats_orangefs_obj); kobject_put(orangefs_obj); }
6 6 1 1 5 5 1 6 5 1 6 6 5 12 12 12 12 12 13 13 13 3 3 3 12 12 13 12 12 12 11 12 12 12 12 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 13 13 13 13 13 13 13 3 3 1 1 3 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 13 13 13 13 13 13 13 13 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/svc.c * * High-level RPC service routines * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> * * Multiple threads pools and NUMAisation * Copyright (c) 2006 Silicon Graphics, Inc. * by Greg Banks <gnb@melbourne.sgi.com> */ #include <linux/linkage.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/net.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/bc_xprt.h> #include <trace/events/sunrpc.h> #include "fail.h" #include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL /* * Mode for mapping cpus to pools. */ enum { SVC_POOL_AUTO = -1, /* choose one of the others */ SVC_POOL_GLOBAL, /* no mapping, just a single global pool * (legacy & UP mode) */ SVC_POOL_PERCPU, /* one pool per cpu */ SVC_POOL_PERNODE /* one pool per numa node */ }; /* * Structure for mapping cpus to pools and vice versa. * Setup once during sunrpc initialisation. */ struct svc_pool_map { int count; /* How many svc_servs use us */ int mode; /* Note: int not enum to avoid * warnings about "enumeration value * not handled in switch" */ unsigned int npools; unsigned int *pool_to; /* maps pool id to cpu or node */ unsigned int *to_pool; /* maps cpu or node to pool id */ }; static struct svc_pool_map svc_pool_map = { .mode = SVC_POOL_DEFAULT }; static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */ static int __param_set_pool_mode(const char *val, struct svc_pool_map *m) { int err, mode; mutex_lock(&svc_pool_map_mutex); err = 0; if (!strncmp(val, "auto", 4)) mode = SVC_POOL_AUTO; else if (!strncmp(val, "global", 6)) mode = SVC_POOL_GLOBAL; else if (!strncmp(val, "percpu", 6)) mode = SVC_POOL_PERCPU; else if (!strncmp(val, "pernode", 7)) mode = SVC_POOL_PERNODE; else err = -EINVAL; if (err) goto out; if (m->count == 0) m->mode = mode; else if (mode != m->mode) err = -EBUSY; out: mutex_unlock(&svc_pool_map_mutex); return err; } static int param_set_pool_mode(const char *val, const struct kernel_param *kp) { struct svc_pool_map *m = kp->arg; return __param_set_pool_mode(val, m); } int sunrpc_set_pool_mode(const char *val) { return __param_set_pool_mode(val, &svc_pool_map); } EXPORT_SYMBOL(sunrpc_set_pool_mode); /** * sunrpc_get_pool_mode - get the current pool_mode for the host * @buf: where to write the current pool_mode * @size: size of @buf * * Grab the current pool_mode from the svc_pool_map and write * the resulting string to @buf. Returns the number of characters * written to @buf (a'la snprintf()). */ int sunrpc_get_pool_mode(char *buf, size_t size) { struct svc_pool_map *m = &svc_pool_map; switch (m->mode) { case SVC_POOL_AUTO: return snprintf(buf, size, "auto"); case SVC_POOL_GLOBAL: return snprintf(buf, size, "global"); case SVC_POOL_PERCPU: return snprintf(buf, size, "percpu"); case SVC_POOL_PERNODE: return snprintf(buf, size, "pernode"); default: return snprintf(buf, size, "%d", m->mode); } } EXPORT_SYMBOL(sunrpc_get_pool_mode); static int param_get_pool_mode(char *buf, const struct kernel_param *kp) { char str[16]; int len; len = sunrpc_get_pool_mode(str, ARRAY_SIZE(str)); /* Ensure we have room for newline and NUL */ len = min_t(int, len, ARRAY_SIZE(str) - 2); /* tack on the newline */ str[len] = '\n'; str[len + 1] = '\0'; return sysfs_emit(buf, "%s", str); } module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode, &svc_pool_map, 0644); /* * Detect best pool mapping mode heuristically, * according to the machine's topology. */ static int svc_pool_map_choose_mode(void) { unsigned int node; if (nr_online_nodes > 1) { /* * Actually have multiple NUMA nodes, * so split pools on NUMA node boundaries */ return SVC_POOL_PERNODE; } node = first_online_node; if (nr_cpus_node(node) > 2) { /* * Non-trivial SMP, or CONFIG_NUMA on * non-NUMA hardware, e.g. with a generic * x86_64 kernel on Xeons. In this case we * want to divide the pools on cpu boundaries. */ return SVC_POOL_PERCPU; } /* default: one global pool */ return SVC_POOL_GLOBAL; } /* * Allocate the to_pool[] and pool_to[] arrays. * Returns 0 on success or an errno. */ static int svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) { m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); if (!m->to_pool) goto fail; m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); if (!m->pool_to) goto fail_free; return 0; fail_free: kfree(m->to_pool); m->to_pool = NULL; fail: return -ENOMEM; } /* * Initialise the pool map for SVC_POOL_PERCPU mode. * Returns number of pools or <0 on error. */ static int svc_pool_map_init_percpu(struct svc_pool_map *m) { unsigned int maxpools = nr_cpu_ids; unsigned int pidx = 0; unsigned int cpu; int err; err = svc_pool_map_alloc_arrays(m, maxpools); if (err) return err; for_each_online_cpu(cpu) { BUG_ON(pidx >= maxpools); m->to_pool[cpu] = pidx; m->pool_to[pidx] = cpu; pidx++; } /* cpus brought online later all get mapped to pool0, sorry */ return pidx; }; /* * Initialise the pool map for SVC_POOL_PERNODE mode. * Returns number of pools or <0 on error. */ static int svc_pool_map_init_pernode(struct svc_pool_map *m) { unsigned int maxpools = nr_node_ids; unsigned int pidx = 0; unsigned int node; int err; err = svc_pool_map_alloc_arrays(m, maxpools); if (err) return err; for_each_node_with_cpus(node) { /* some architectures (e.g. SN2) have cpuless nodes */ BUG_ON(pidx > maxpools); m->to_pool[node] = pidx; m->pool_to[pidx] = node; pidx++; } /* nodes brought online later all get mapped to pool0, sorry */ return pidx; } /* * Add a reference to the global map of cpus to pools (and * vice versa) if pools are in use. * Initialise the map if we're the first user. * Returns the number of pools. If this is '1', no reference * was taken. */ static unsigned int svc_pool_map_get(void) { struct svc_pool_map *m = &svc_pool_map; int npools = -1; mutex_lock(&svc_pool_map_mutex); if (m->count++) { mutex_unlock(&svc_pool_map_mutex); return m->npools; } if (m->mode == SVC_POOL_AUTO) m->mode = svc_pool_map_choose_mode(); switch (m->mode) { case SVC_POOL_PERCPU: npools = svc_pool_map_init_percpu(m); break; case SVC_POOL_PERNODE: npools = svc_pool_map_init_pernode(m); break; } if (npools <= 0) { /* default, or memory allocation failure */ npools = 1; m->mode = SVC_POOL_GLOBAL; } m->npools = npools; mutex_unlock(&svc_pool_map_mutex); return npools; } /* * Drop a reference to the global map of cpus to pools. * When the last reference is dropped, the map data is * freed; this allows the sysadmin to change the pool. */ static void svc_pool_map_put(void) { struct svc_pool_map *m = &svc_pool_map; mutex_lock(&svc_pool_map_mutex); if (!--m->count) { kfree(m->to_pool); m->to_pool = NULL; kfree(m->pool_to); m->pool_to = NULL; m->npools = 0; } mutex_unlock(&svc_pool_map_mutex); } static int svc_pool_map_get_node(unsigned int pidx) { const struct svc_pool_map *m = &svc_pool_map; if (m->count) { if (m->mode == SVC_POOL_PERCPU) return cpu_to_node(m->pool_to[pidx]); if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it * will only run on cpus in the given pool. */ static inline void svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) { struct svc_pool_map *m = &svc_pool_map; unsigned int node = m->pool_to[pidx]; /* * The caller checks for sv_nrpools > 1, which * implies that we've been initialized. */ WARN_ON_ONCE(m->count == 0); if (m->count == 0) return; switch (m->mode) { case SVC_POOL_PERCPU: { set_cpus_allowed_ptr(task, cpumask_of(node)); break; } case SVC_POOL_PERNODE: { set_cpus_allowed_ptr(task, cpumask_of_node(node)); break; } } } /** * svc_pool_for_cpu - Select pool to run a thread on this cpu * @serv: An RPC service * * Use the active CPU and the svc_pool_map's mode setting to * select the svc thread pool to use. Once initialized, the * svc_pool_map does not change. * * Return value: * A pointer to an svc_pool */ struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv) { struct svc_pool_map *m = &svc_pool_map; int cpu = raw_smp_processor_id(); unsigned int pidx = 0; if (serv->sv_nrpools <= 1) return serv->sv_pools; switch (m->mode) { case SVC_POOL_PERCPU: pidx = m->to_pool[cpu]; break; case SVC_POOL_PERNODE: pidx = m->to_pool[cpu_to_node(cpu)]; break; } return &serv->sv_pools[pidx % serv->sv_nrpools]; } static int svc_rpcb_setup(struct svc_serv *serv, struct net *net) { int err; err = rpcb_create_local(net); if (err) return err; /* Remove any stale portmap registrations */ svc_unregister(serv, net); return 0; } void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) { svc_unregister(serv, net); rpcb_put_local(net); } static int svc_uses_rpcbind(struct svc_serv *serv) { unsigned int p, i; for (p = 0; p < serv->sv_nprogs; p++) { struct svc_program *progp = &serv->sv_programs[p]; for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; if (!progp->pg_vers[i]->vs_hidden) return 1; } } return 0; } int svc_bind(struct svc_serv *serv, struct net *net) { if (!svc_uses_rpcbind(serv)) return 0; return svc_rpcb_setup(serv, net); } EXPORT_SYMBOL_GPL(svc_bind); #if defined(CONFIG_SUNRPC_BACKCHANNEL) static void __svc_init_bc(struct svc_serv *serv) { lwq_init(&serv->sv_cb_list); } #else static void __svc_init_bc(struct svc_serv *serv) { } #endif /* * Create an RPC service */ static struct svc_serv * __svc_create(struct svc_program *prog, int nprogs, struct svc_stat *stats, unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; unsigned int xdrsize; unsigned int i; if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; serv->sv_name = prog->pg_name; serv->sv_programs = prog; serv->sv_nprogs = nprogs; serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE); serv->sv_threadfn = threadfn; xdrsize = 0; for (i = 0; i < nprogs; i++) { struct svc_program *progp = &prog[i]; progp->pg_lovers = progp->pg_nvers-1; for (vers = 0; vers < progp->pg_nvers ; vers++) if (progp->pg_vers[vers]) { progp->pg_hivers = vers; if (progp->pg_lovers > vers) progp->pg_lovers = vers; if (progp->pg_vers[vers]->vs_xdrsize > xdrsize) xdrsize = progp->pg_vers[vers]->vs_xdrsize; } } serv->sv_xdrsize = xdrsize; INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); timer_setup(&serv->sv_temptimer, NULL, 0); spin_lock_init(&serv->sv_lock); __svc_init_bc(serv); serv->sv_nrpools = npools; serv->sv_pools = kcalloc(serv->sv_nrpools, sizeof(struct svc_pool), GFP_KERNEL); if (!serv->sv_pools) { kfree(serv); return NULL; } for (i = 0; i < serv->sv_nrpools; i++) { struct svc_pool *pool = &serv->sv_pools[i]; dprintk("svc: initialising pool %u for %s\n", i, serv->sv_name); pool->sp_id = i; lwq_init(&pool->sp_xprts); INIT_LIST_HEAD(&pool->sp_all_threads); init_llist_head(&pool->sp_idle_threads); percpu_counter_init(&pool->sp_messages_arrived, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_sockets_queued, 0, GFP_KERNEL); percpu_counter_init(&pool->sp_threads_woken, 0, GFP_KERNEL); } return serv; } /** * svc_create - Create an RPC service * @prog: the RPC program the new service will handle * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { return __svc_create(prog, 1, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads * @prog: Array of RPC programs the new service will handle * @nprogs: Number of programs in the array * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create_pooled(struct svc_program *prog, unsigned int nprogs, struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); serv = __svc_create(prog, nprogs, stats, bufsize, npools, threadfn); if (!serv) goto out_err; serv->sv_is_pooled = true; return serv; out_err: svc_pool_map_put(); return NULL; } EXPORT_SYMBOL_GPL(svc_create_pooled); /* * Destroy an RPC service. Should be called with appropriate locking to * protect sv_permsocks and sv_tempsocks. */ void svc_destroy(struct svc_serv **servp) { struct svc_serv *serv = *servp; unsigned int i; *servp = NULL; dprintk("svc: svc_destroy(%s)\n", serv->sv_programs->pg_name); timer_shutdown_sync(&serv->sv_temptimer); /* * Remaining transports at this point are not expected. */ WARN_ONCE(!list_empty(&serv->sv_permsocks), "SVC: permsocks remain for %s\n", serv->sv_programs->pg_name); WARN_ONCE(!list_empty(&serv->sv_tempsocks), "SVC: tempsocks remain for %s\n", serv->sv_programs->pg_name); cache_clean_deferred(serv); if (serv->sv_is_pooled) svc_pool_map_put(); for (i = 0; i < serv->sv_nrpools; i++) { struct svc_pool *pool = &serv->sv_pools[i]; percpu_counter_destroy(&pool->sp_messages_arrived); percpu_counter_destroy(&pool->sp_sockets_queued); percpu_counter_destroy(&pool->sp_threads_woken); } kfree(serv->sv_pools); kfree(serv); } EXPORT_SYMBOL_GPL(svc_destroy); static bool svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { rqstp->rq_maxpages = svc_serv_maxpages(serv); /* rq_pages' last entry is NULL for historical reasons. */ rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, sizeof(struct page *), GFP_KERNEL, node); if (!rqstp->rq_pages) return false; return true; } /* * Release an RPC server buffer */ static void svc_release_buffer(struct svc_rqst *rqstp) { unsigned long i; for (i = 0; i < rqstp->rq_maxpages; i++) if (rqstp->rq_pages[i]) put_page(rqstp->rq_pages[i]); kfree(rqstp->rq_pages); } static void svc_rqst_free(struct svc_rqst *rqstp) { folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); if (rqstp->rq_scratch_folio) folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); kfree_rcu(rqstp, rq_rcu_head); } static struct svc_rqst * svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) { struct svc_rqst *rqstp; rqstp = kzalloc_node(sizeof(*rqstp), GFP_KERNEL, node); if (!rqstp) return rqstp; folio_batch_init(&rqstp->rq_fbatch); rqstp->rq_server = serv; rqstp->rq_pool = pool; rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) goto out_enomem; rqstp->rq_resp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_resp) goto out_enomem; if (!svc_init_buffer(rqstp, serv, node)) goto out_enomem; rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages, sizeof(struct bio_vec), GFP_KERNEL, node); if (!rqstp->rq_bvec) goto out_enomem; rqstp->rq_err = -EAGAIN; /* No error yet */ serv->sv_nrthreads += 1; pool->sp_nrthreads += 1; /* Protected by whatever lock the service uses when calling * svc_set_num_threads() */ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); return rqstp; out_enomem: svc_rqst_free(rqstp); return NULL; } /** * svc_pool_wake_idle_thread - Awaken an idle thread in @pool * @pool: service thread pool * * Can be called from soft IRQ or process context. Finding an idle * service thread and marking it BUSY is atomic with respect to * other calls to svc_pool_wake_idle_thread(). * */ void svc_pool_wake_idle_thread(struct svc_pool *pool) { struct svc_rqst *rqstp; struct llist_node *ln; rcu_read_lock(); ln = READ_ONCE(pool->sp_idle_threads.first); if (ln) { rqstp = llist_entry(ln, struct svc_rqst, rq_idle); WRITE_ONCE(rqstp->rq_qtime, ktime_get()); if (!task_is_running(rqstp->rq_task)) { wake_up_process(rqstp->rq_task); trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid); percpu_counter_inc(&pool->sp_threads_woken); } else { trace_svc_pool_thread_running(pool, rqstp->rq_task->pid); } rcu_read_unlock(); return; } rcu_read_unlock(); trace_svc_pool_thread_noidle(pool, 0); } EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread); static struct svc_pool * svc_pool_next(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) { return pool ? pool : &serv->sv_pools[(*state)++ % serv->sv_nrpools]; } static struct svc_pool * svc_pool_victim(struct svc_serv *serv, struct svc_pool *target_pool, unsigned int *state) { struct svc_pool *pool; unsigned int i; pool = target_pool; if (!pool) { for (i = 0; i < serv->sv_nrpools; i++) { pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; if (pool->sp_nrthreads) break; } } if (pool && pool->sp_nrthreads) { set_bit(SP_VICTIM_REMAINS, &pool->sp_flags); set_bit(SP_NEED_VICTIM, &pool->sp_flags); return pool; } return NULL; } static int svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { struct svc_rqst *rqstp; struct task_struct *task; struct svc_pool *chosen_pool; unsigned int state = serv->sv_nrthreads-1; int node; int err; do { nrservs--; chosen_pool = svc_pool_next(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); rqstp = svc_prepare_thread(serv, chosen_pool, node); if (!rqstp) return -ENOMEM; task = kthread_create_on_node(serv->sv_threadfn, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { svc_exit_thread(rqstp); return PTR_ERR(task); } rqstp->rq_task = task; if (serv->sv_nrpools > 1) svc_pool_map_set_cpumask(task, chosen_pool->sp_id); svc_sock_update_bufs(serv); wake_up_process(task); wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN); err = rqstp->rq_err; if (err) { svc_exit_thread(rqstp); return err; } } while (nrservs > 0); return 0; } static int svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { unsigned int state = serv->sv_nrthreads-1; struct svc_pool *victim; do { victim = svc_pool_victim(serv, pool, &state); if (!victim) break; svc_pool_wake_idle_thread(victim); wait_on_bit(&victim->sp_flags, SP_VICTIM_REMAINS, TASK_IDLE); nrservs++; } while (nrservs < 0); return 0; } /** * svc_set_num_threads - adjust number of threads per RPC service * @serv: RPC service to adjust * @pool: Specific pool from which to choose threads, or NULL * @nrservs: New number of threads for @serv (0 or less means kill all threads) * * Create or destroy threads to make the number of threads for @serv the * given number. If @pool is non-NULL, change only threads in that pool; * otherwise, round-robin between all pools for @serv. @serv's * sv_nrthreads is adjusted for each thread created or destroyed. * * Caller must ensure mutual exclusion between this and server startup or * shutdown. * * Returns zero on success or a negative errno if an error occurred while * starting a thread. */ int svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { if (!pool) nrservs -= serv->sv_nrthreads; else nrservs -= pool->sp_nrthreads; if (nrservs > 0) return svc_start_kthreads(serv, pool, nrservs); if (nrservs < 0) return svc_stop_kthreads(serv, pool, nrservs); return 0; } EXPORT_SYMBOL_GPL(svc_set_num_threads); /** * svc_rqst_replace_page - Replace one page in rq_pages[] * @rqstp: svc_rqst with pages to replace * @page: replacement page * * When replacing a page in rq_pages, batch the release of the * replaced pages to avoid hammering the page allocator. * * Return values: * %true: page replaced * %false: array bounds checking failed */ bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { struct page **begin = rqstp->rq_pages; struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); return false; } if (*rqstp->rq_next_page) { if (!folio_batch_add(&rqstp->rq_fbatch, page_folio(*rqstp->rq_next_page))) __folio_batch_release(&rqstp->rq_fbatch); } get_page(page); *(rqstp->rq_next_page++) = page; return true; } EXPORT_SYMBOL_GPL(svc_rqst_replace_page); /** * svc_rqst_release_pages - Release Reply buffer pages * @rqstp: RPC transaction context * * Release response pages that might still be in flight after * svc_send, and any spliced filesystem-owned pages. */ void svc_rqst_release_pages(struct svc_rqst *rqstp) { int i, count = rqstp->rq_next_page - rqstp->rq_respages; if (count) { release_pages(rqstp->rq_respages, count); for (i = 0; i < count; i++) rqstp->rq_respages[i] = NULL; } } /** * svc_exit_thread - finalise the termination of a sunrpc server thread * @rqstp: the svc_rqst which represents the thread. * * When a thread started with svc_new_thread() exits it must call * svc_exit_thread() as its last act. This must be done with the * service mutex held. Normally this is held by a DIFFERENT thread, the * one that is calling svc_set_num_threads() and which will wait for * SP_VICTIM_REMAINS to be cleared before dropping the mutex. If the * thread exits for any reason other than svc_thread_should_stop() * returning %true (which indicated that svc_set_num_threads() is * waiting for it to exit), then it must take the service mutex itself, * which can only safely be done using mutex_try_lock(). */ void svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct svc_pool *pool = rqstp->rq_pool; list_del_rcu(&rqstp->rq_all); pool->sp_nrthreads -= 1; serv->sv_nrthreads -= 1; svc_sock_update_bufs(serv); svc_rqst_free(rqstp); clear_and_wake_up_bit(SP_VICTIM_REMAINS, &pool->sp_flags); } EXPORT_SYMBOL_GPL(svc_exit_thread); /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. * * No netconfig infrastructure is available in the kernel, so * we map IP_ protocol numbers to netids by hand. * * Returns zero on success; a negative errno value is returned * if any error occurs. */ static int __svc_rpcb_register4(struct net *net, const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; const char *netid; int error; switch (protocol) { case IPPROTO_UDP: netid = RPCBIND_NETID_UDP; break; case IPPROTO_TCP: netid = RPCBIND_NETID_TCP; break; default: return -ENOPROTOOPT; } error = rpcb_v4_register(net, program, version, (const struct sockaddr *)&sin, netid); /* * User space didn't support rpcbind v4, so retry this * registration request with the legacy rpcbind v2 protocol. */ if (error == -EPROTONOSUPPORT) error = rpcb_register(net, program, version, protocol, port); return error; } #if IS_ENABLED(CONFIG_IPV6) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. * * No netconfig infrastructure is available in the kernel, so * we map IP_ protocol numbers to netids by hand. * * Returns zero on success; a negative errno value is returned * if any error occurs. */ static int __svc_rpcb_register6(struct net *net, const u32 program, const u32 version, const unsigned short protocol, const unsigned short port) { const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; const char *netid; int error; switch (protocol) { case IPPROTO_UDP: netid = RPCBIND_NETID_UDP6; break; case IPPROTO_TCP: netid = RPCBIND_NETID_TCP6; break; default: return -ENOPROTOOPT; } error = rpcb_v4_register(net, program, version, (const struct sockaddr *)&sin6, netid); /* * User space didn't support rpcbind version 4, so we won't * use a PF_INET6 listener. */ if (error == -EPROTONOSUPPORT) error = -EAFNOSUPPORT; return error; } #endif /* IS_ENABLED(CONFIG_IPV6) */ /* * Register a kernel RPC service via rpcbind version 4. * * Returns zero on success; a negative errno value is returned * if any error occurs. */ static int __svc_register(struct net *net, const char *progname, const u32 program, const u32 version, const int family, const unsigned short protocol, const unsigned short port) { int error = -EAFNOSUPPORT; switch (family) { case PF_INET: error = __svc_rpcb_register4(net, program, version, protocol, port); break; #if IS_ENABLED(CONFIG_IPV6) case PF_INET6: error = __svc_rpcb_register6(net, program, version, protocol, port); #endif } trace_svc_register(progname, version, family, protocol, port, error); return error; } static int svc_rpcbind_set_version(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { return __svc_register(net, progp->pg_name, progp->pg_prog, version, family, proto, port); } int svc_generic_rpcbind_set(struct net *net, const struct svc_program *progp, u32 version, int family, unsigned short proto, unsigned short port) { const struct svc_version *vers = progp->pg_vers[version]; int error; if (vers == NULL) return 0; if (vers->vs_hidden) { trace_svc_noregister(progp->pg_name, version, proto, port, family, 0); return 0; } /* * Don't register a UDP port if we need congestion * control. */ if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP) return 0; error = svc_rpcbind_set_version(net, progp, version, family, proto, port); return (vers->vs_rpcb_optnl) ? 0 : error; } EXPORT_SYMBOL_GPL(svc_generic_rpcbind_set); /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register * @net: net namespace for the service to register * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * * Service is registered for any address in the passed-in protocol family */ int svc_register(const struct svc_serv *serv, struct net *net, const int family, const unsigned short proto, const unsigned short port) { unsigned int p, i; int error = 0; WARN_ON_ONCE(proto == 0 && port == 0); if (proto == 0 && port == 0) return -EINVAL; for (p = 0; p < serv->sv_nprogs; p++) { struct svc_program *progp = &serv->sv_programs[p]; for (i = 0; i < progp->pg_nvers; i++) { error = progp->pg_rpcbind_set(net, progp, i, family, proto, port); if (error < 0) { printk(KERN_WARNING "svc: failed to register " "%sv%u RPC service (errno %d).\n", progp->pg_name, i, -error); break; } } } return error; } /* * If user space is running rpcbind, it should take the v4 UNSET * and clear everything for this [program, version]. If user space * is running portmap, it will reject the v4 UNSET, but won't have * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient * in this case to clear all existing entries for [program, version]. */ static void __svc_unregister(struct net *net, const u32 program, const u32 version, const char *progname) { int error; error = rpcb_v4_register(net, program, version, NULL, ""); /* * User space didn't support rpcbind v4, so retry this * request with the legacy rpcbind v2 protocol. */ if (error == -EPROTONOSUPPORT) error = rpcb_register(net, program, version, 0, 0); trace_svc_unregister(progname, version, error); } /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not * hidden) to make way for a new instance of the service. * * The result of unregistration is reported via dprintk for those who want * verification of the result, but is otherwise not important. */ static void svc_unregister(const struct svc_serv *serv, struct net *net) { struct sighand_struct *sighand; unsigned long flags; unsigned int p, i; clear_thread_flag(TIF_SIGPENDING); for (p = 0; p < serv->sv_nprogs; p++) { struct svc_program *progp = &serv->sv_programs[p]; for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; if (progp->pg_vers[i]->vs_hidden) continue; __svc_unregister(net, progp->pg_prog, i, progp->pg_name); } } rcu_read_lock(); sighand = rcu_dereference(current->sighand); spin_lock_irqsave(&sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(&sighand->siglock, flags); rcu_read_unlock(); } /* * dprintk the given error with the address of the client that caused it. */ #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static __printf(2, 3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) { struct va_format vaf; va_list args; char buf[RPC_MAX_ADDRBUFLEN]; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; dprintk("svc: %s: %pV", svc_print_addr(rqstp, buf, sizeof(buf)), &vaf); va_end(args); } #else static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} #endif __be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *ret) { const struct svc_version *versp = NULL; /* compiler food */ const struct svc_procedure *procp = NULL; if (rqstp->rq_vers >= progp->pg_nvers ) goto err_bad_vers; versp = progp->pg_vers[rqstp->rq_vers]; if (!versp) goto err_bad_vers; /* * Some protocol versions (namely NFSv4) require some form of * congestion control. (See RFC 7530 section 3.1 paragraph 2) * In other words, UDP is not allowed. We mark those when setting * up the svc_xprt, and verify that here. * * The spec is not very clear about what error should be returned * when someone tries to access a server that is listening on UDP * for lower versions. RPC_PROG_MISMATCH seems to be the closest * fit. */ if (versp->vs_need_cong_ctrl && rqstp->rq_xprt && !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) goto err_bad_vers; if (rqstp->rq_proc >= versp->vs_nproc) goto err_bad_proc; rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc]; /* Initialize storage for argp and resp */ memset(rqstp->rq_argp, 0, procp->pc_argzero); memset(rqstp->rq_resp, 0, procp->pc_ressize); /* Bump per-procedure stats counter */ this_cpu_inc(versp->vs_count[rqstp->rq_proc]); ret->dispatch = versp->vs_dispatch; return rpc_success; err_bad_vers: ret->mismatch.lovers = progp->pg_lovers; ret->mismatch.hivers = progp->pg_hivers; return rpc_prog_mismatch; err_bad_proc: return rpc_proc_unavail; } EXPORT_SYMBOL_GPL(svc_generic_init_request); /* * Common routine for processing the RPC request. */ static int svc_process_common(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_res_stream; struct svc_program *progp = NULL; const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_process_info process; enum svc_auth_status auth_res; unsigned int aoffset; int pr, rc; __be32 *p; /* Reset the accept_stat for the RPC */ rqstp->rq_accept_statp = NULL; /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); /* Construct the first words of the reply: */ svcxdr_init_encode(rqstp); xdr_stream_encode_be32(xdr, rqstp->rq_xid); xdr_stream_encode_be32(xdr, rpc_reply); p = xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 4); if (unlikely(!p)) goto err_short_len; if (*p++ != cpu_to_be32(RPC_VERSION)) goto err_bad_rpc; xdr_stream_encode_be32(xdr, rpc_msg_accepted); rqstp->rq_prog = be32_to_cpup(p++); rqstp->rq_vers = be32_to_cpup(p++); rqstp->rq_proc = be32_to_cpup(p); for (pr = 0; pr < serv->sv_nprogs; pr++) if (rqstp->rq_prog == serv->sv_programs[pr].pg_prog) progp = &serv->sv_programs[pr]; /* * Decode auth data, and add verifier to reply buffer. * We do this before anything else in order to get a decent * auth verifier. */ auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; case SVC_GARBAGE: rqstp->rq_auth_stat = rpc_autherr_badcred; goto err_bad_auth; case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: goto close; case SVC_DROP: goto dropit; case SVC_COMPLETE: goto sendit; default: pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); rqstp->rq_auth_stat = rpc_autherr_failed; goto err_bad_auth; } if (progp == NULL) goto err_bad_prog; switch (progp->pg_init_request(rqstp, progp, &process)) { case rpc_success: break; case rpc_prog_unavail: goto err_bad_prog; case rpc_prog_mismatch: goto err_bad_vers; case rpc_proc_unavail: goto err_bad_proc; } procp = rqstp->rq_procinfo; /* Should this check go into the dispatcher? */ if (!procp || !procp->pc_func) goto err_bad_proc; /* Syntactic check complete */ if (serv->sv_stats) serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); aoffset = xdr_stream_pos(xdr); /* un-reserve some of the out-queue now that we have a * better idea of reply size */ if (procp->pc_xdrressize) svc_reserve_auth(rqstp, procp->pc_xdrressize<<2); /* Call the function that processes the request. */ rc = process.dispatch(rqstp); xdr_finish_decode(xdr); if (!rc) goto dropit; if (rqstp->rq_auth_stat != rpc_auth_ok) goto err_bad_auth; if (*rqstp->rq_accept_statp != rpc_success) xdr_truncate_encode(xdr, aoffset); if (procp->pc_encode == NULL) goto dropit; sendit: if (svc_authorise(rqstp)) goto close_xprt; return 1; /* Caller can now send it */ dropit: svc_authorise(rqstp); /* doesn't hurt to call this twice */ dprintk("svc: svc_process dropit\n"); return 0; close: svc_authorise(rqstp); close_xprt: if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) svc_xprt_close(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); return 0; err_short_len: svc_printk(rqstp, "short len %u, dropping request\n", rqstp->rq_arg.len); goto close_xprt; err_bad_rpc: if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_MISMATCH); /* Only RPCv2 supported */ xdr_stream_encode_u32(xdr, RPC_VERSION); xdr_stream_encode_u32(xdr, RPC_VERSION); return 1; /* don't wrap */ err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); if (serv->sv_stats) serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of reply status: */ xdr_truncate_encode(xdr, XDR_UNIT * 2); xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_AUTH_ERROR); xdr_stream_encode_be32(xdr, rqstp->rq_auth_stat); goto sendit; err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; err_bad_vers: svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_mismatch; /* * svc_authenticate() has already added the verifier and * advanced the stream just past rq_accept_statp. */ xdr_stream_encode_u32(xdr, process.mismatch.lovers); xdr_stream_encode_u32(xdr, process.mismatch.hivers); goto sendit; err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; } /* * Drop request */ static void svc_drop(struct svc_rqst *rqstp) { trace_svc_drop(rqstp); } static void svc_release_rqst(struct svc_rqst *rqstp) { const struct svc_procedure *procp = rqstp->rq_procinfo; if (procp && procp->pc_release) procp->pc_release(rqstp); } /** * svc_process - Execute one RPC transaction * @rqstp: RPC transaction context * */ void svc_process(struct svc_rqst *rqstp) { struct kvec *resv = &rqstp->rq_res.head[0]; __be32 *p; #if IS_ENABLED(CONFIG_FAIL_SUNRPC) if (!fail_sunrpc.ignore_server_disconnect && should_fail(&fail_sunrpc.attr, 1)) svc_xprt_deferred_close(rqstp->rq_xprt); #endif /* * Setup response xdr_buf. * Initially it has just one page */ rqstp->rq_next_page = &rqstp->rq_respages[1]; resv->iov_base = page_address(rqstp->rq_respages[0]); resv->iov_len = 0; rqstp->rq_res.pages = rqstp->rq_next_page; rqstp->rq_res.len = 0; rqstp->rq_res.page_base = 0; rqstp->rq_res.page_len = 0; rqstp->rq_res.buflen = PAGE_SIZE; rqstp->rq_res.tail[0].iov_base = NULL; rqstp->rq_res.tail[0].iov_len = 0; svcxdr_init_decode(rqstp); p = xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2); if (unlikely(!p)) goto out_drop; rqstp->rq_xid = *p++; if (unlikely(*p != rpc_call)) goto out_baddir; if (!svc_process_common(rqstp)) { svc_release_rqst(rqstp); goto out_drop; } svc_send(rqstp); svc_release_rqst(rqstp); return; out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", be32_to_cpu(*p)); if (rqstp->rq_server->sv_stats) rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) /** * svc_process_bc - process a reverse-direction RPC request * @req: RPC request to be used for client-side processing * @rqstp: server-side execution context * */ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) { struct rpc_timeout timeout = { .to_increment = 0, }; struct rpc_task *task; int proc_error; /* Build the svc_rqst used by the common processing routine */ rqstp->rq_xid = req->rq_xid; rqstp->rq_prot = req->rq_xprt->prot; rqstp->rq_bc_net = req->rq_xprt->xprt_net; rqstp->rq_addrlen = sizeof(req->rq_xprt->addr); memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); /* Adjust the argument buffer length */ rqstp->rq_arg.len = req->rq_private_buf.len; if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) { rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len; rqstp->rq_arg.page_len = 0; } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) rqstp->rq_arg.page_len = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; else rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len; /* Reset the response buffer */ rqstp->rq_res.head[0].iov_len = 0; /* * Skip the XID and calldir fields because they've already * been processed by the caller. */ svcxdr_init_decode(rqstp); if (!xdr_inline_decode(&rqstp->rq_arg_stream, XDR_UNIT * 2)) return; /* Parse and execute the bc call */ proc_error = svc_process_common(rqstp); atomic_dec(&req->rq_xprt->bc_slot_count); if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); svc_release_rqst(rqstp); return; } /* Finally, send the reply synchronously */ if (rqstp->bc_to_initval > 0) { timeout.to_initval = rqstp->bc_to_initval; timeout.to_retries = rqstp->bc_to_retries; } else { timeout.to_initval = req->rq_xprt->timeout->to_initval; timeout.to_retries = req->rq_xprt->timeout->to_retries; } timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); svc_release_rqst(rqstp); if (IS_ERR(task)) return; WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); rpc_put_task(task); } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** * svc_max_payload - Return transport-specific limit on the RPC payload * @rqstp: RPC transaction context * * Returns the maximum number of payload bytes the current transport * allows. */ u32 svc_max_payload(const struct svc_rqst *rqstp) { u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload; if (rqstp->rq_server->sv_max_payload < max) max = rqstp->rq_server->sv_max_payload; return max; } EXPORT_SYMBOL_GPL(svc_max_payload); /** * svc_proc_name - Return RPC procedure name in string form * @rqstp: svc_rqst to operate on * * Return value: * Pointer to a NUL-terminated string */ const char *svc_proc_name(const struct svc_rqst *rqstp) { if (rqstp && rqstp->rq_procinfo) return rqstp->rq_procinfo->pc_name; return "unknown"; } /** * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res * @length: size of payload, in bytes * * Returns zero on success, or a negative errno if a permanent * error occurred. */ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset, length); } EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname * @p: buffer containing remaining section of pathname * @total: total length of the pathname argument * * The VFS symlink API demands a NUL-terminated pathname in mapped memory. * Returns pointer to a NUL-terminated string, or an ERR_PTR. Caller must free * the returned string. */ char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total) { size_t len, remaining; char *result, *dst; result = kmalloc(total + 1, GFP_KERNEL); if (!result) return ERR_PTR(-ESERVERFAULT); dst = result; remaining = total; len = min_t(size_t, total, first->iov_len); if (len) { memcpy(dst, first->iov_base, len); dst += len; remaining -= len; } if (remaining) { len = min_t(size_t, remaining, PAGE_SIZE); memcpy(dst, p, len); dst += len; } *dst = '\0'; /* Sanity check: Linux doesn't allow the pathname argument to * contain a NUL byte. */ if (strlen(result) != total) { kfree(result); return ERR_PTR(-EINVAL); } return result; } EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname);
205 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_MROUTE_H #define __LINUX_MROUTE_H #include <linux/in.h> #include <linux/pim.h> #include <net/fib_rules.h> #include <net/fib_notifier.h> #include <uapi/linux/mroute.h> #include <linux/mroute_base.h> #include <linux/sockptr.h> #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { return opt >= MRT_BASE && opt <= MRT_MAX; } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); bool ipmr_rule_default(const struct fib_rule *rule); int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } static inline int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { return -ENOIOCTLCMD; } static inline int ip_mr_init(void) { return 0; } static inline int ip_mroute_opt(int opt) { return 0; } static inline bool ipmr_rule_default(const struct fib_rule *rule) { return true; } static inline int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { return 1; } #endif #define VIFF_STATIC 0x8000 struct mfc_cache_cmp_arg { __be32 mfc_mcastgrp; __be32 mfc_origin; }; /** * struct mfc_cache - multicast routing entries * @_c: Common multicast routing information; has to be first [for casting] * @mfc_mcastgrp: destination multicast group address * @mfc_origin: source address * @cmparg: used for rhashtable comparisons */ struct mfc_cache { struct mr_mfc _c; union { struct { __be32 mfc_mcastgrp; __be32 mfc_origin; }; struct mfc_cache_cmp_arg cmparg; }; }; struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); #endif
4 4 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2022-2025 NXP * Copyright 2024 Furong Xu <0x1207@gmail.com> */ #include "common.h" #include "netlink.h" struct mm_req_info { struct ethnl_req_info base; }; struct mm_reply_data { struct ethnl_reply_data base; struct ethtool_mm_state state; struct ethtool_mm_stats stats; }; #define MM_REPDATA(__reply_base) \ container_of(__reply_base, struct mm_reply_data, base) #define ETHTOOL_MM_STAT_CNT \ (__ETHTOOL_A_MM_STAT_CNT - (ETHTOOL_A_MM_STAT_PAD + 1)) const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = { [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int mm_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct mm_reply_data *data = MM_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_ops *ops; int ret; ops = dev->ethtool_ops; if (!ops->get_mm) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = ops->get_mm(dev, &data->state); if (ret) goto out_complete; if (ops->get_mm_stats && (req_base->flags & ETHTOOL_FLAG_STATS)) ops->get_mm_stats(dev, &data->stats); out_complete: ethnl_ops_complete(dev); return ret; } static int mm_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int len = 0; len += nla_total_size(sizeof(u8)); /* _MM_PMAC_ENABLED */ len += nla_total_size(sizeof(u8)); /* _MM_TX_ENABLED */ len += nla_total_size(sizeof(u8)); /* _MM_TX_ACTIVE */ len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_ENABLED */ len += nla_total_size(sizeof(u8)); /* _MM_VERIFY_STATUS */ len += nla_total_size(sizeof(u32)); /* _MM_VERIFY_TIME */ len += nla_total_size(sizeof(u32)); /* _MM_MAX_VERIFY_TIME */ len += nla_total_size(sizeof(u32)); /* _MM_TX_MIN_FRAG_SIZE */ len += nla_total_size(sizeof(u32)); /* _MM_RX_MIN_FRAG_SIZE */ if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _MM_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_MM_STAT_CNT; return len; } static int mm_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, ETHTOOL_A_MM_STAT_PAD)) return -EMSGSIZE; return 0; } static int mm_put_stats(struct sk_buff *skb, const struct ethtool_mm_stats *stats) { struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_MM_STATS); if (!nest) return -EMSGSIZE; if (mm_put_stat(skb, stats->MACMergeFrameAssErrorCount, ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS) || mm_put_stat(skb, stats->MACMergeFrameSmdErrorCount, ETHTOOL_A_MM_STAT_SMD_ERRORS) || mm_put_stat(skb, stats->MACMergeFrameAssOkCount, ETHTOOL_A_MM_STAT_REASSEMBLY_OK) || mm_put_stat(skb, stats->MACMergeFragCountRx, ETHTOOL_A_MM_STAT_RX_FRAG_COUNT) || mm_put_stat(skb, stats->MACMergeFragCountTx, ETHTOOL_A_MM_STAT_TX_FRAG_COUNT) || mm_put_stat(skb, stats->MACMergeHoldCount, ETHTOOL_A_MM_STAT_HOLD_COUNT)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int mm_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct mm_reply_data *data = MM_REPDATA(reply_base); const struct ethtool_mm_state *state = &data->state; if (nla_put_u8(skb, ETHTOOL_A_MM_TX_ENABLED, state->tx_enabled) || nla_put_u8(skb, ETHTOOL_A_MM_TX_ACTIVE, state->tx_active) || nla_put_u8(skb, ETHTOOL_A_MM_PMAC_ENABLED, state->pmac_enabled) || nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_ENABLED, state->verify_enabled) || nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_STATUS, state->verify_status) || nla_put_u32(skb, ETHTOOL_A_MM_VERIFY_TIME, state->verify_time) || nla_put_u32(skb, ETHTOOL_A_MM_MAX_VERIFY_TIME, state->max_verify_time) || nla_put_u32(skb, ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, state->tx_min_frag_size) || nla_put_u32(skb, ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, state->rx_min_frag_size)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && mm_put_stats(skb, &data->stats)) return -EMSGSIZE; return 0; } const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1] = { [ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MM_VERIFY_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_MM_VERIFY_TIME] = NLA_POLICY_RANGE(NLA_U32, 1, 128), [ETHTOOL_A_MM_TX_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_MM_PMAC_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_MM_TX_MIN_FRAG_SIZE] = NLA_POLICY_RANGE(NLA_U32, 60, 252), }; static void mm_state_to_cfg(const struct ethtool_mm_state *state, struct ethtool_mm_cfg *cfg) { /* We could also compare state->verify_status against * ETHTOOL_MM_VERIFY_STATUS_DISABLED, but state->verify_enabled * is more like an administrative state which should be seen in * ETHTOOL_MSG_MM_GET replies. For example, a port with verification * disabled might be in the ETHTOOL_MM_VERIFY_STATUS_INITIAL * if it's down. */ cfg->verify_enabled = state->verify_enabled; cfg->verify_time = state->verify_time; cfg->tx_enabled = state->tx_enabled; cfg->pmac_enabled = state->pmac_enabled; cfg->tx_min_frag_size = state->tx_min_frag_size; } static int ethnl_set_mm_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_mm && ops->set_mm ? 1 : -EOPNOTSUPP; } static int ethnl_set_mm(struct ethnl_req_info *req_info, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct net_device *dev = req_info->dev; struct ethtool_mm_state state = {}; struct nlattr **tb = info->attrs; struct ethtool_mm_cfg cfg = {}; bool mod = false; int ret; ret = dev->ethtool_ops->get_mm(dev, &state); if (ret) return ret; mm_state_to_cfg(&state, &cfg); ethnl_update_bool(&cfg.verify_enabled, tb[ETHTOOL_A_MM_VERIFY_ENABLED], &mod); ethnl_update_u32(&cfg.verify_time, tb[ETHTOOL_A_MM_VERIFY_TIME], &mod); ethnl_update_bool(&cfg.tx_enabled, tb[ETHTOOL_A_MM_TX_ENABLED], &mod); ethnl_update_bool(&cfg.pmac_enabled, tb[ETHTOOL_A_MM_PMAC_ENABLED], &mod); ethnl_update_u32(&cfg.tx_min_frag_size, tb[ETHTOOL_A_MM_TX_MIN_FRAG_SIZE], &mod); if (!mod) return 0; if (cfg.verify_time > state.max_verify_time) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MM_VERIFY_TIME], "verifyTime exceeds device maximum"); return -ERANGE; } if (cfg.verify_enabled && !cfg.tx_enabled) { NL_SET_ERR_MSG(extack, "Verification requires TX enabled"); return -EINVAL; } if (cfg.tx_enabled && !cfg.pmac_enabled) { NL_SET_ERR_MSG(extack, "TX enabled requires pMAC enabled"); return -EINVAL; } ret = dev->ethtool_ops->set_mm(dev, &cfg, extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_mm_request_ops = { .request_cmd = ETHTOOL_MSG_MM_GET, .reply_cmd = ETHTOOL_MSG_MM_GET_REPLY, .hdr_attr = ETHTOOL_A_MM_HEADER, .req_info_size = sizeof(struct mm_req_info), .reply_data_size = sizeof(struct mm_reply_data), .prepare_data = mm_prepare_data, .reply_size = mm_reply_size, .fill_reply = mm_fill_reply, .set_validate = ethnl_set_mm_validate, .set = ethnl_set_mm, .set_ntf_cmd = ETHTOOL_MSG_MM_NTF, }; /* Returns whether a given device supports the MAC merge layer * (has an eMAC and a pMAC). Must be called under rtnl_lock() and * ethnl_ops_begin(). */ bool __ethtool_dev_mm_supported(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_mm_state state = {}; int ret = -EOPNOTSUPP; if (ops && ops->get_mm) ret = ops->get_mm(dev, &state); return !ret; } bool ethtool_dev_mm_supported(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; bool supported; int ret; ASSERT_RTNL(); if (!ops) return false; ret = ethnl_ops_begin(dev); if (ret < 0) return false; supported = __ethtool_dev_mm_supported(dev); ethnl_ops_complete(dev); return supported; } EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported); static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv, bool tx_active) { if (mmsv->ops->configure_tx) mmsv->ops->configure_tx(mmsv, tx_active); } static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv, bool pmac_enabled) { if (mmsv->ops->configure_pmac) mmsv->ops->configure_pmac(mmsv, pmac_enabled); } static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket) { if (mmsv->ops->send_mpacket) mmsv->ops->send_mpacket(mmsv, mpacket); } /** * ethtool_mmsv_verify_timer - Timer for MAC Merge verification * @t: timer_list struct containing private info * * Verify the MAC Merge capability in the local TX direction, by * transmitting Verify mPackets up to 3 times. Wait until link * partner responds with a Response mPacket, otherwise fail. */ static void ethtool_mmsv_verify_timer(struct timer_list *t) { struct ethtool_mmsv *mmsv = timer_container_of(mmsv, t, verify_timer); unsigned long flags; bool rearm = false; spin_lock_irqsave(&mmsv->lock, flags); switch (mmsv->status) { case ETHTOOL_MM_VERIFY_STATUS_INITIAL: case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: if (mmsv->verify_retries != 0) { ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY); rearm = true; } else { mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; } mmsv->verify_retries--; break; case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: ethtool_mmsv_configure_tx(mmsv, true); break; default: break; } if (rearm) { mod_timer(&mmsv->verify_timer, jiffies + msecs_to_jiffies(mmsv->verify_time)); } spin_unlock_irqrestore(&mmsv->lock, flags); } static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv) { if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled && mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); mod_timer(&mmsv->verify_timer, jiffies); } } static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv) { /* If verification is disabled, configure FPE right away. * Otherwise let the timer code do it. */ if (!mmsv->verify_enabled) { ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled); ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled); } else { mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; if (netif_running(mmsv->dev)) ethtool_mmsv_verify_timer_arm(mmsv); } } /** * ethtool_mmsv_stop() - Stop MAC Merge Software Verification * @mmsv: MAC Merge Software Verification state * * Drivers should call this method in a state where the hardware is * about to lose state, like ndo_stop() or suspend(), and turning off * MAC Merge features would be superfluous. Otherwise, prefer * ethtool_mmsv_link_state_handle() with up=false. */ void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv) { timer_shutdown_sync(&mmsv->verify_timer); } EXPORT_SYMBOL_GPL(ethtool_mmsv_stop); /** * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification * of link state changes * @mmsv: MAC Merge Software Verification state * @up: True if device carrier is up and able to pass verification packets * * Calling context is expected to be from a task, interrupts enabled. */ void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up) { unsigned long flags; ethtool_mmsv_stop(mmsv); spin_lock_irqsave(&mmsv->lock, flags); if (up && mmsv->pmac_enabled) { /* VERIFY process requires pMAC enabled when NIC comes up */ ethtool_mmsv_configure_pmac(mmsv, true); /* New link => maybe new partner => new verification process */ ethtool_mmsv_apply(mmsv); } else { /* Reset the reported verification state while the link is down */ if (mmsv->verify_enabled) mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; /* No link or pMAC not enabled */ ethtool_mmsv_configure_pmac(mmsv, false); ethtool_mmsv_configure_tx(mmsv, false); } spin_unlock_irqrestore(&mmsv->lock, flags); } EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle); /** * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification * of interrupt-based events * @mmsv: MAC Merge Software Verification state * @event: Event which took place (packet transmission or reception) * * Calling context expects to have interrupts disabled. */ void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, enum ethtool_mmsv_event event) { /* This is interrupt context, just spin_lock() */ spin_lock(&mmsv->lock); if (!mmsv->pmac_enabled) goto unlock; switch (event) { case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET: /* Link partner has sent verify mPacket */ ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE); break; case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET: /* Local device has sent verify mPacket */ if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; break; case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET: /* Link partner has sent response mPacket */ if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; break; } unlock: spin_unlock(&mmsv->lock); } EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle); static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv) { /* TX is active if administratively enabled, and verification either * succeeded, or was administratively disabled. */ return mmsv->tx_enabled && (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED); } /** * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification * @mmsv: MAC Merge Software Verification state * @state: see struct ethtool_mm_state * * Drivers are expected to call this from their ethtool_ops :: get_mm() * method. */ void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_state *state) { unsigned long flags; spin_lock_irqsave(&mmsv->lock, flags); state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; state->verify_enabled = mmsv->verify_enabled; state->pmac_enabled = mmsv->pmac_enabled; state->verify_time = mmsv->verify_time; state->tx_enabled = mmsv->tx_enabled; state->verify_status = mmsv->status; state->tx_active = ethtool_mmsv_is_tx_active(mmsv); spin_unlock_irqrestore(&mmsv->lock, flags); } EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm); /** * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification * @mmsv: MAC Merge Software Verification state * @cfg: see struct ethtool_mm_cfg * * Drivers are expected to call this from their ethtool_ops :: set_mm() * method. */ void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg) { unsigned long flags; /* Wait for the verification that's currently in progress to finish */ ethtool_mmsv_stop(mmsv); spin_lock_irqsave(&mmsv->lock, flags); mmsv->verify_enabled = cfg->verify_enabled; mmsv->pmac_enabled = cfg->pmac_enabled; mmsv->verify_time = cfg->verify_time; mmsv->tx_enabled = cfg->tx_enabled; if (!cfg->verify_enabled) mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; ethtool_mmsv_apply(mmsv); spin_unlock_irqrestore(&mmsv->lock, flags); } EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm); /** * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state * @mmsv: MAC Merge Software Verification state * @dev: Pointer to network interface * @ops: Methods for implementing the generic functionality * * The MAC Merge Software Verification is a timer- and event-based state * machine intended for network interfaces which lack a hardware-based * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer * is managed by the core code, whereas events are supplied by the * driver explicitly calling one of the other API functions. */ void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, const struct ethtool_mmsv_ops *ops) { mmsv->ops = ops; mmsv->dev = dev; mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); spin_lock_init(&mmsv->lock); } EXPORT_SYMBOL_GPL(ethtool_mmsv_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/suspend.h> #include <linux/suspend_ioctls.h> #include <linux/utsname.h> #include <linux/freezer.h> #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/cpuidle.h> #include <linux/crypto.h> struct swsusp_info { struct new_utsname uts; u32 version_code; unsigned long num_physpages; int cpus; unsigned long image_pages; unsigned long pages; unsigned long size; } __aligned(PAGE_SIZE); #if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) extern bool filesystem_freeze_enabled; #endif #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __init hibernate_reserved_size_init(void); extern void __init hibernate_image_size_init(void); #ifdef CONFIG_ARCH_HIBERNATION_HEADER /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) static inline int init_header_complete(struct swsusp_info *info) { return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); } static inline const char *check_image_kernel(struct swsusp_info *info) { return arch_hibernation_header_restore(info) ? "architecture specific data" : NULL; } #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] */ #define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) /* * Keep 1 MB of memory free so that device drivers can allocate some pages in * their .suspend() routines without breaking the suspend to disk. */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; extern char hib_comp_algo[CRYPTO_MAX_ALG_NAME]; /* kernel/power/swap.c */ extern unsigned int swsusp_header_flags; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); #ifdef CONFIG_STRICT_KERNEL_RWX /* kernel/power/snapshot.c */ extern void enable_restore_image_protection(void); #else static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ extern bool hibernation_in_progress(void); #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = 0644, \ }, \ .show = _name##_show, \ .store = _name##_store, \ } #define power_attr_ro(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ .name = __stringify(_name), \ .mode = S_IRUGO, \ }, \ .show = _name##_show, \ } /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ extern unsigned long reserved_size; extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); extern void clear_or_poison_free_pages(void); /* * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries * (PBEs) which is the main data structure of swsusp. * * Using struct snapshot_handle we can transfer the image, including its * metadata, as a continuous sequence of bytes with the help of * snapshot_read_next() and snapshot_write_next(). * * The code that writes the image to a storage or transfers it to * the user land is required to use snapshot_read_next() for this * purpose and it should not make any assumptions regarding the internal * structure of the image. Similarly, the code that reads the image from * a storage or transfers it from the user land is required to use * snapshot_write_next(). * * This may allow us to change the internal structure of the image * in the future with considerably less effort. */ struct snapshot_handle { unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. current) */ void *buffer; /* address of the block to read from * or write to */ int sync_read; /* Set to one to notify the caller of * snapshot_write_next() that it may * need to call wait_on_bio_chain() */ }; /* This macro returns the address from/to which the caller of * snapshot_read_next()/snapshot_write_next() is allowed to * read/write data after the function returns */ #define data_of(handle) ((handle).buffer) extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); extern int snapshot_read_next(struct snapshot_handle *handle); extern int snapshot_write_next(struct snapshot_handle *handle); int snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); extern bool hibernate_acquire(void); extern void hibernate_release(void); extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); extern int swsusp_swap_in_use(void); /* * Flags that can be passed from the hibernatig hernel to the "boot" kernel in * the image header. */ #define SF_COMPRESSION_ALG_LZO 0 /* dummy, details given below */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 #define SF_CRC32_MODE 4 #define SF_HW_SIG 8 /* * Bit to indicate the compression algorithm to be used(for LZ4). The same * could be checked while saving/loading image to/from disk to use the * corresponding algorithms. * * By default, LZO compression is enabled if SF_CRC32_MODE is set. Use * SF_COMPRESSION_ALG_LZ4 to override this behaviour and use LZ4. * * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZO(dummy) -> Compression, LZO * SF_CRC32_MODE, SF_COMPRESSION_ALG_LZ4 -> Compression, LZ4 */ #define SF_COMPRESSION_ALG_LZ4 16 /* kernel/power/hibernate.c */ int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); #else static inline int swsusp_unmark(void) { return 0; } #endif struct __kernel_old_timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ extern const char * const pm_labels[]; extern const char *pm_states[]; extern const char *mem_sleep_states[]; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ #define mem_sleep_current PM_SUSPEND_ON static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } #endif /* !CONFIG_SUSPEND */ #ifdef CONFIG_PM_TEST_SUSPEND /* kernel/power/suspend_test.c */ extern void suspend_test_start(void); extern void suspend_test_finish(const char *label); #else /* !CONFIG_PM_TEST_SUSPEND */ static inline void suspend_test_start(void) {} static inline void suspend_test_finish(const char *label) {} #endif /* !CONFIG_PM_TEST_SUSPEND */ #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); #endif #ifdef CONFIG_HIGHMEM int restore_highmem(void); #else static inline unsigned int count_highmem_pages(void) { return 0; } static inline int restore_highmem(void) { return 0; } #endif /* * Suspend test levels */ enum { /* keep first */ TEST_NONE, TEST_CORE, TEST_CPUS, TEST_PLATFORM, TEST_DEVICES, TEST_FREEZER, /* keep last */ __TEST_AFTER_LAST }; #define TEST_FIRST TEST_NONE #define TEST_MAX (__TEST_AFTER_LAST - 1) #ifdef CONFIG_PM_SLEEP_DEBUG extern int pm_test_level; #else #define pm_test_level (TEST_NONE) #endif #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { int error; error = freeze_processes(); /* * freeze_processes() automatically thaws every task if freezing * fails. So we need not do anything extra upon error. */ if (error) return error; error = freeze_kernel_threads(); /* * freeze_kernel_threads() thaws only kernel threads upon freezing * failure. So we have to thaw the userspace tasks ourselves. */ if (error) thaw_processes(); return error; } static inline void suspend_thaw_processes(void) { thaw_processes(); } #else static inline int suspend_freeze_processes(void) { return 0; } static inline void suspend_thaw_processes(void) { } #endif #ifdef CONFIG_PM_AUTOSLEEP /* kernel/power/autosleep.c */ extern int pm_autosleep_init(void); extern int pm_autosleep_lock(void); extern void pm_autosleep_unlock(void); extern suspend_state_t pm_autosleep_state(void); extern int pm_autosleep_set_state(suspend_state_t state); #else /* !CONFIG_PM_AUTOSLEEP */ static inline int pm_autosleep_init(void) { return 0; } static inline int pm_autosleep_lock(void) { return 0; } static inline void pm_autosleep_unlock(void) {} static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } #endif /* !CONFIG_PM_AUTOSLEEP */ #ifdef CONFIG_PM_WAKELOCKS /* kernel/power/wakelock.c */ extern ssize_t pm_show_wakelocks(char *buf, bool show_active); extern int pm_wake_lock(const char *buf); extern int pm_wake_unlock(const char *buf); #endif /* !CONFIG_PM_WAKELOCKS */ static inline int pm_sleep_disable_secondary_cpus(void) { cpuidle_pause(); return suspend_disable_secondary_cpus(); } static inline void pm_sleep_enable_secondary_cpus(void) { suspend_enable_secondary_cpus(); cpuidle_resume(); } void dpm_save_errno(int err);
5 4 34 34 34 1 1 1 1 1 32 32 32 32 32 67 67 67 45 46 46 41 41 41 41 46 5 5 5 5 38 38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, SUSE. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include "protocol.h" static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; ssk = mptcp_subflow_get_send(msk); if (!ssk) return -EINVAL; mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); return 0; } static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) { struct sock *ssk; ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); return 0; } static struct mptcp_sched_ops mptcp_sched_default = { .get_send = mptcp_sched_default_get_send, .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; /* Must be called with rcu read lock held */ struct mptcp_sched_ops *mptcp_sched_find(const char *name) { struct mptcp_sched_ops *sched, *ret = NULL; list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { if (!strcmp(sched->name, name)) { ret = sched; break; } } return ret; } /* Build string with list of available scheduler values. * Similar to tcp_get_available_congestion_control() */ void mptcp_get_available_schedulers(char *buf, size_t maxlen) { struct mptcp_sched_ops *sched; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", sched->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } int mptcp_validate_scheduler(struct mptcp_sched_ops *sched) { if (!sched->get_send) { pr_err("%s does not implement required ops\n", sched->name); return -EINVAL; } return 0; } int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { int ret; ret = mptcp_validate_scheduler(sched); if (ret) return ret; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { spin_unlock(&mptcp_sched_list_lock); return -EEXIST; } list_add_tail_rcu(&sched->list, &mptcp_sched_list); spin_unlock(&mptcp_sched_list_lock); pr_debug("%s registered\n", sched->name); return 0; } void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) { if (sched == &mptcp_sched_default) return; spin_lock(&mptcp_sched_list_lock); list_del_rcu(&sched->list); spin_unlock(&mptcp_sched_list_lock); } void mptcp_sched_init(void) { mptcp_register_scheduler(&mptcp_sched_default); } int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched) { if (!sched) sched = &mptcp_sched_default; if (!bpf_try_module_get(sched, sched->owner)) return -EBUSY; msk->sched = sched; if (msk->sched->init) msk->sched->init(msk); pr_debug("sched=%s\n", msk->sched->name); return 0; } void mptcp_release_sched(struct mptcp_sock *msk) { struct mptcp_sched_ops *sched = msk->sched; if (!sched) return; msk->sched = NULL; if (sched->release) sched->release(msk); bpf_module_put(sched, sched->owner); } void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled) { WRITE_ONCE(subflow->scheduled, scheduled); } int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_send */ if (__mptcp_check_fallback(msk)) { if (msk->first && __tcp_can_send(msk->first) && sk_stream_memory_free(msk->first)) { mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); return 0; } return -EINVAL; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_send(msk); return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_retrans */ if (__mptcp_check_fallback(msk)) return -EINVAL; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_retrans(msk); if (msk->sched->get_retrans) return msk->sched->get_retrans(msk); return msk->sched->get_send(msk); }
258 257 255 259 178 177 177 178 42 422 422 422 422 24 25 115 117 14 14 14 45 45 45 14 14 14 14 14 108 140 140 138 375 6 6 6 6 6 6 115 24 6 6 6 6 6 6 6 6 6 6 6 6 6 10 10 9 10 10 10 4 4 1 4 4 1 1 10 10 9 7 7 7 7 10 10 10 10 10 9 1 10 8 8 10 10 10 10 10 10 10 10 10 3 10 35 34 35 19 1 1 2 35 35 23 23 6 6 34 17 30 34 9 4 1 3 8 8 18 18 18 18 18 18 4 18 18 5 16 121 121 121 120 96 96 96 121 119 121 121 121 120 120 120 120 115 40 40 40 40 121 120 33 8 8 33 33 15 19 19 2 19 1 18 8 8 15 15 15 15 27 32 33 32 3 32 32 32 26 32 32 32 32 24 1 1 1 1 1 1 1 1 1 1 1 32 60 58 60 34 35 10 10 3 35 1 34 7 7 7 7 1 6 2 1 6 6 3 3 3 33 33 33 1 1 33 33 8 1 1 33 6 9 9 34 1 1 352 349 350 250 313 351 350 349 352 353 352 346 353 352 3 349 172 172 351 351 352 349 12 12 12 12 12 12 9 9 2 8 5 12 12 6 6 5 1 12 115 114 115 115 115 114 117 2 117 116 117 117 117 117 117 116 117 117 117 117 117 113 114 114 114 114 117 116 2 2 117 115 94 96 96 115 113 117 107 108 106 106 106 105 74 74 67 67 117 117 115 115 115 120 120 118 118 24 120 120 120 109 109 107 107 106 119 116 116 117 120 119 115 111 24 119 119 120 121 122 122 122 120 119 120 120 2 120 103 258 258 257 258 257 26 258 258 254 258 256 255 258 259 259 256 258 259 257 257 254 25 25 253 254 250 252 253 254 250 252 238 124 122 223 237 235 225 224 105 224 223 235 236 235 236 236 235 238 225 124 124 124 123 124 122 124 124 124 121 124 124 124 124 375 375 375 375 375 375 375 375 375 1 1 1 1 5 6 6 6 6 6 5 5 5 5 5 5 5 5 5 4 5 5 5 4 4 5 5 5 5 5 5 5 5 5 5 5 4 4 4 5 5 5 5 5 5 1 5 1 5 5 5 5 5 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free(struct net *net, struct fib6_node *fn) { kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; new_tb = fib6_alloc_table(net, id); if (!new_tb) return NULL; spin_lock_bh(&net->ipv6.fib_table_hash_lock); tb = fib6_get_table(net, id); if (unlikely(tb)) { spin_unlock_bh(&net->ipv6.fib_table_hash_lock); kfree(new_tb); return tb; } fib6_link_table(net, new_tb); spin_unlock_bh(&net->ipv6.fib_table_hash_lock); return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct hlist_head *head; struct fib6_table *tb; if (!id) id = RT6_TABLE_MAIN; head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; /* See comment in fib6_link_table(). RCU is not required, * but rcu_dereference_raw() is used to avoid data-race. */ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) if (tb->tb6_id == id) return tb; return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; unsigned int nsiblings; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; nsiblings = READ_ONCE(rt->fib6_nsiblings); if (nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_info *arg = _arg; __fib6_drop_pcpu_from(nh, arg); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt); if (rt->nh) { spin_lock(&rt->nh->lock); if (!list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); spin_unlock(&rt->nh->lock); } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_remove_gc_list(iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) WRITE_ONCE(rt->fib6_nsiblings, rt->fib6_nsiblings + 1); } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail_rcu(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings + 1); BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); rcu_read_unlock(); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rcu_read_lock(); rt6_multipath_rebalance(next_sibling); rcu_read_unlock(); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack, struct list_head *purge_list) { int err; spin_lock(&rt->nh->lock); if (rt->nh->dead) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = fib6_add_rt2node(fn, rt, info, extack, purge_list); if (!err) list_add(&rt->nh_list, &rt->nh->f6i_list); } spin_unlock(&rt->nh->lock); return err; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif if (rt->nh) err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); else err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { struct fib6_info *iter, *next; list_for_each_entry_safe(iter, next, &purge_list, purge_link) { list_del(&iter->purge_link); fib6_purge_rt(iter, fn, info->nl_net); fib6_info_release(iter); } __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) WRITE_ONCE(sibling->fib6_nsiblings, sibling->fib6_nsiblings - 1); WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; spin_lock_init(&net->ipv6.fib_table_hash_lock); net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .dumpit = inet6_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
27 27 14 11 16 27 27 18 18 10 12 16 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0 /* * direct.c - Low-level direct PCI config space access */ #include <linux/pci.h> #include <linux/init.h> #include <linux/dmi.h> #include <asm/pci_x86.h> /* * Functions for accessing PCI base (first 256 bytes) and extended * (4096 bytes per PCI function) configuration space with type 1 * accesses. */ #define PCI_CONF1_ADDRESS(bus, devfn, reg) \ (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \ | (devfn << 8) | (reg & 0xFC)) static int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) { *value = -1; return -EINVAL; } raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); switch (len) { case 1: *value = inb(0xCFC + (reg & 3)); break; case 2: *value = inw(0xCFC + (reg & 2)); break; case 4: *value = inl(0xCFC); break; } raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } static int pci_conf1_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); switch (len) { case 1: outb((u8)value, 0xCFC + (reg & 3)); break; case 2: outw((u16)value, 0xCFC + (reg & 2)); break; case 4: outl((u32)value, 0xCFC); break; } raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } #undef PCI_CONF1_ADDRESS const struct pci_raw_ops pci_direct_conf1 = { .read = pci_conf1_read, .write = pci_conf1_write, }; /* * Functions for accessing PCI configuration space with type 2 accesses */ #define PCI_CONF2_ADDRESS(dev, reg) (u16)(0xC000 | (dev << 8) | reg) static int pci_conf2_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; int dev, fn; WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) { *value = -1; return -EINVAL; } dev = PCI_SLOT(devfn); fn = PCI_FUNC(devfn); if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); switch (len) { case 1: *value = inb(PCI_CONF2_ADDRESS(dev, reg)); break; case 2: *value = inw(PCI_CONF2_ADDRESS(dev, reg)); break; case 4: *value = inl(PCI_CONF2_ADDRESS(dev, reg)); break; } outb(0, 0xCF8); raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } static int pci_conf2_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; int dev, fn; WARN_ON(seg); if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; dev = PCI_SLOT(devfn); fn = PCI_FUNC(devfn); if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); switch (len) { case 1: outb((u8)value, PCI_CONF2_ADDRESS(dev, reg)); break; case 2: outw((u16)value, PCI_CONF2_ADDRESS(dev, reg)); break; case 4: outl((u32)value, PCI_CONF2_ADDRESS(dev, reg)); break; } outb(0, 0xCF8); raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } #undef PCI_CONF2_ADDRESS static const struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; /* * Before we decide to use direct hardware access mechanisms, we try to do some * trivial checks to ensure it at least _seems_ to be working -- we just test * whether bus 00 contains a host bridge (this is similar to checking * techniques used in XFree86, but ours should be more reliable since we * attempt to make use of direct access hints provided by the PCI BIOS). * * This should be close to trivial, but it isn't, because there are buggy * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID. */ static int __init pci_sanity_check(const struct pci_raw_ops *o) { u32 x = 0; int devfn; if (pci_probe & PCI_NO_CHECKS) return 1; /* Assume Type 1 works for newer systems. This handles machines that don't have anything on PCI Bus 0. */ if (dmi_get_bios_year() >= 2001) return 1; for (devfn = 0; devfn < 0x100; devfn++) { if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x)) continue; if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA) return 1; if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x)) continue; if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ) return 1; } DBG(KERN_WARNING "PCI: Sanity check failed\n"); return 0; } static int __init pci_check_type1(void) { unsigned long flags; unsigned int tmp; int works = 0; local_irq_save(flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { works = 1; } outl(tmp, 0xCF8); local_irq_restore(flags); return works; } static int __init pci_check_type2(void) { unsigned long flags; int works = 0; local_irq_save(flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && pci_sanity_check(&pci_direct_conf2)) { works = 1; } local_irq_restore(flags); return works; } void __init pci_direct_init(int type) { if (type == 0) return; printk(KERN_INFO "PCI: Using configuration type %d for base access\n", type); if (type == 1) { raw_pci_ops = &pci_direct_conf1; if (raw_pci_ext_ops) return; if (!(pci_probe & PCI_HAS_IO_ECS)) return; printk(KERN_INFO "PCI: Using configuration type 1 " "for extended access\n"); raw_pci_ext_ops = &pci_direct_conf1; return; } raw_pci_ops = &pci_direct_conf2; } int __init pci_direct_probe(void) { if ((pci_probe & PCI_PROBE_CONF1) == 0) goto type2; if (!request_region(0xCF8, 8, "PCI conf1")) goto type2; if (pci_check_type1()) { raw_pci_ops = &pci_direct_conf1; port_cf9_safe = true; return 1; } release_region(0xCF8, 8); type2: if ((pci_probe & PCI_PROBE_CONF2) == 0) return 0; if (!request_region(0xCF8, 4, "PCI conf2")) return 0; if (!request_region(0xC000, 0x1000, "PCI conf2")) goto fail2; if (pci_check_type2()) { raw_pci_ops = &pci_direct_conf2; port_cf9_safe = true; return 2; } release_region(0xC000, 0x1000); fail2: release_region(0xCF8, 4); return 0; }
1 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * super.c */ /* * This file implements code to read the superblock, read and initialise * in-memory structures at mount time, and all the VFS glue code to register * the filesystem. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" #include "xattr.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; enum Opt_errors { Opt_errors_continue, Opt_errors_panic, }; enum squashfs_param { Opt_errors, Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; const struct squashfs_decompressor_thread_ops *thread_ops; int thread_num; }; static const struct constant_table squashfs_param_errors[] = { {"continue", Opt_errors_continue }, {"panic", Opt_errors_panic }, {} }; static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), fsparam_string("threads", Opt_threads), {} }; static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (strcmp(str, "single") == 0) { opts->thread_ops = &squashfs_decompressor_single; return 0; } if (strcmp(str, "multi") == 0) { opts->thread_ops = &squashfs_decompressor_multi; return 0; } if (strcmp(str, "percpu") == 0) { opts->thread_ops = &squashfs_decompressor_percpu; return 0; } #endif return -EINVAL; } static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS int ret; unsigned long num; ret = kstrtoul(str, 0, &num); if (ret != 0) return -EINVAL; if (num > 1) { opts->thread_ops = &squashfs_decompressor_multi; if (num > opts->thread_ops->max_decompressors()) return -EINVAL; opts->thread_num = (int)num; return 0; } #ifdef CONFIG_SQUASHFS_DECOMP_SINGLE if (num == 1) { opts->thread_ops = &squashfs_decompressor_single; opts->thread_num = 1; return 0; } #endif #endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ return -EINVAL; } static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) { int ret = squashfs_parse_param_threads_str(str, opts); if (ret == 0) return ret; return squashfs_parse_param_threads_num(str, opts); } static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, squashfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_errors: opts->errors = result.uint_32; break; case Opt_threads: if (squashfs_parse_param_threads(param->string, opts) != 0) return -EINVAL; break; default: return -EINVAL; } return 0; } static const struct squashfs_decompressor *supported_squashfs_filesystem( struct fs_context *fc, short major, short minor, short id) { const struct squashfs_decompressor *decompressor; if (major < SQUASHFS_MAJOR) { errorf(fc, "Major/Minor mismatch, older Squashfs %d.%d " "filesystems are unsupported", major, minor); return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { errorf(fc, "Major/Minor mismatch, trying to mount newer " "%d.%d filesystem", major, minor); errorf(fc, "Please update your kernel"); return NULL; } decompressor = squashfs_lookup_decompressor(id); if (!decompressor->supported) { errorf(fc, "Filesystem uses \"%s\" compression. This is not supported", decompressor->name); return NULL; } return decompressor; } static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct squashfs_mount_opts *opts = fc->fs_private; struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; struct inode *root; long long root_inode; unsigned short flags; unsigned int fragments; u64 lookup_table_start, xattr_id_table_start, next_table; int err, devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); TRACE("Entered squashfs_fill_superblock\n"); if (!devblksize) { errorf(fc, "squashfs: unable to set blocksize\n"); return -EINVAL; } sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); return -ENOMEM; } msblk = sb->s_fs_info; msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = devblksize; msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); /* * msblk->bytes_used is checked in squashfs_read_table to ensure reads * are not beyond filesystem end. But as we're using * squashfs_read_table here to read the superblock (including the value * of bytes_used) we need to set it to an initial sensible dummy value */ msblk->bytes_used = sizeof(*sblk); sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); if (IS_ERR(sblk)) { errorf(fc, "unable to read squashfs_super_block"); err = PTR_ERR(sblk); sblk = NULL; goto failed_mount; } err = -EINVAL; /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!(fc->sb_flags & SB_SILENT)) errorf(fc, "Can't find a SQUASHFS superblock on %pg", sb->s_bdev); goto failed_mount; } if (opts->thread_num == 0) { msblk->max_thread_num = msblk->thread_ops->max_decompressors(); } else { msblk->max_thread_num = opts->thread_num; } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( fc, le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); if (msblk->decompressor == NULL) goto failed_mount; /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); if (msblk->bytes_used < 0 || msblk->bytes_used > bdev_nr_bytes(sb->s_bdev)) goto failed_mount; /* Check block size for sanity */ msblk->block_size = le32_to_cpu(sblk->block_size); if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) goto insanity; /* * Check the system page size is not larger than the filesystem * block size (by default 128K). This is currently not supported. */ if (PAGE_SIZE > msblk->block_size) { errorf(fc, "Page size > filesystem block size (%d). This is " "currently not supported!", msblk->block_size); goto failed_mount; } /* Check block log for sanity */ msblk->block_log = le16_to_cpu(sblk->block_log); if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) goto failed_mount; /* Check that block_size and block_log match */ if (msblk->block_size != (1 << msblk->block_log)) goto insanity; /* Check the root inode for sanity */ root_inode = le64_to_cpu(sblk->root_inode); if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) goto insanity; msblk->inode_table = le64_to_cpu(sblk->inode_table_start); msblk->directory_table = le64_to_cpu(sblk->directory_table_start); msblk->inodes = le32_to_cpu(sblk->inodes); msblk->fragments = le32_to_cpu(sblk->fragments); msblk->ids = le16_to_cpu(sblk->no_ids); flags = le16_to_cpu(sblk->flags); TRACE("Found valid superblock on %pg\n", sb->s_bdev); TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) ? "un" : ""); TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) ? "un" : ""); TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); TRACE("Block size %d\n", msblk->block_size); TRACE("Number of inodes %d\n", msblk->inodes); TRACE("Number of fragments %d\n", msblk->fragments); TRACE("Number of ids %d\n", msblk->ids); TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); TRACE("sblk->fragment_table_start %llx\n", (u64) le64_to_cpu(sblk->fragment_table_start)); TRACE("sblk->id_table_start %llx\n", (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_min = 0; sb->s_time_max = U32_MAX; sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); if (IS_ERR(msblk->block_cache)) { err = PTR_ERR(msblk->block_cache); goto failed_mount; } /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", SQUASHFS_READ_PAGES, msblk->block_size); if (IS_ERR(msblk->read_page)) { errorf(fc, "Failed to allocate read_page block"); err = PTR_ERR(msblk->read_page); goto failed_mount; } if (msblk->devblksize == PAGE_SIZE) { struct inode *cache = new_inode(sb); if (cache == NULL) { err = -ENOMEM; goto failed_mount; } set_nlink(cache, 1); cache->i_size = OFFSET_MAX; mapping_set_gfp_mask(cache->i_mapping, GFP_NOFS); msblk->cache_mapping = cache->i_mapping; } msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; goto insanity; } /* Handle xattrs */ sb->s_xattr = squashfs_xattr_handlers; xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { next_table = msblk->bytes_used; goto allocate_id_index_table; } /* Allocate and read xattr id lookup table */ msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); if (IS_ERR(msblk->xattr_id_table)) { errorf(fc, "unable to read xattr id index table"); err = PTR_ERR(msblk->xattr_id_table); msblk->xattr_id_table = NULL; if (err != -ENOTSUPP) goto failed_mount; } next_table = msblk->xattr_table; allocate_id_index_table: /* Allocate and read id index table */ msblk->id_table = squashfs_read_id_index_table(sb, le64_to_cpu(sblk->id_table_start), next_table, msblk->ids); if (IS_ERR(msblk->id_table)) { errorf(fc, "unable to read id index table"); err = PTR_ERR(msblk->id_table); msblk->id_table = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->id_table[0]); /* Handle inode lookup table */ lookup_table_start = le64_to_cpu(sblk->lookup_table_start); if (lookup_table_start == SQUASHFS_INVALID_BLK) goto handle_fragments; /* Allocate and read inode lookup table */ msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, lookup_table_start, next_table, msblk->inodes); if (IS_ERR(msblk->inode_lookup_table)) { errorf(fc, "unable to read inode lookup table"); err = PTR_ERR(msblk->inode_lookup_table); msblk->inode_lookup_table = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->inode_lookup_table[0]); sb->s_export_op = &squashfs_export_ops; handle_fragments: fragments = msblk->fragments; if (fragments == 0) goto check_directory_table; msblk->fragment_cache = squashfs_cache_init("fragment", min(SQUASHFS_CACHED_FRAGMENTS, fragments), msblk->block_size); if (IS_ERR(msblk->fragment_cache)) { err = PTR_ERR(msblk->fragment_cache); goto failed_mount; } /* Allocate and read fragment index table */ msblk->fragment_index = squashfs_read_fragment_index_table(sb, le64_to_cpu(sblk->fragment_table_start), next_table, fragments); if (IS_ERR(msblk->fragment_index)) { errorf(fc, "unable to read fragment index table"); err = PTR_ERR(msblk->fragment_index); msblk->fragment_index = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->fragment_index[0]); check_directory_table: /* Sanity check directory_table */ if (msblk->directory_table > next_table) { err = -EINVAL; goto insanity; } /* Sanity check inode_table */ if (msblk->inode_table >= msblk->directory_table) { err = -EINVAL; goto insanity; } /* allocate root */ root = new_inode(sb); if (!root) { err = -ENOMEM; goto failed_mount; } err = squashfs_read_inode(root, root_inode); if (err) { make_bad_inode(root); iput(root); goto failed_mount; } insert_inode_hash(root); sb->s_root = d_make_root(root); if (sb->s_root == NULL) { ERROR("Root inode create failed\n"); err = -ENOMEM; goto failed_mount; } TRACE("Leaving squashfs_fill_super\n"); kfree(sblk); return 0; insanity: errorf(fc, "squashfs image failed sanity check"); failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); if (msblk->cache_mapping) iput(msblk->cache_mapping->host); msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); kfree(msblk->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); return err; } static int squashfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, squashfs_fill_super); } static int squashfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; struct squashfs_mount_opts *opts = fc->fs_private; sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); return 0; } static void squashfs_free_fs_context(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations squashfs_context_ops = { .get_tree = squashfs_get_tree, .free = squashfs_free_fs_context, .parse_param = squashfs_parse_param, .reconfigure = squashfs_reconfigure, }; static int squashfs_show_options(struct seq_file *s, struct dentry *root) { struct super_block *sb = root->d_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; if (msblk->panic_on_errors) seq_puts(s, ",errors=panic"); else seq_puts(s, ",errors=continue"); #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (msblk->thread_ops == &squashfs_decompressor_single) { seq_puts(s, ",threads=single"); return 0; } if (msblk->thread_ops == &squashfs_decompressor_percpu) { seq_puts(s, ",threads=percpu"); return 0; } #endif #ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS seq_printf(s, ",threads=%d", msblk->max_thread_num); #endif return 0; } static int squashfs_init_fs_context(struct fs_context *fc) { struct squashfs_mount_opts *opts; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; #ifdef CONFIG_SQUASHFS_DECOMP_SINGLE opts->thread_ops = &squashfs_decompressor_single; #elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) opts->thread_ops = &squashfs_decompressor_multi; #elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) opts->thread_ops = &squashfs_decompressor_percpu; #else #error "fail: unknown squashfs decompression thread mode?" #endif opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; } static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); TRACE("Entered squashfs_statfs\n"); buf->f_type = SQUASHFS_MAGIC; buf->f_bsize = msblk->block_size; buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; buf->f_bfree = buf->f_bavail = 0; buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; buf->f_fsid = u64_to_fsid(id); return 0; } static void squashfs_put_super(struct super_block *sb) { if (sb->s_fs_info) { struct squashfs_sb_info *sbi = sb->s_fs_info; squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); if (sbi->cache_mapping) iput(sbi->cache_mapping->host); sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); kfree(sbi->inode_lookup_table); kfree(sbi->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } } static struct kmem_cache *squashfs_inode_cachep; static void init_once(void *foo) { struct squashfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(squashfs_inode_cachep); } static int __init init_squashfs_fs(void) { int err = init_inodecache(); if (err) return err; err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); return err; } pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); } static struct inode *squashfs_alloc_inode(struct super_block *sb) { struct squashfs_inode_info *ei = alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL); return ei ? &ei->vfs_inode : NULL; } static void squashfs_free_inode(struct inode *inode) { kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", .init_fs_context = squashfs_init_fs_context, .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("squashfs"); static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, .free_inode = squashfs_free_inode, .statfs = squashfs_statfs, .put_super = squashfs_put_super, .show_options = squashfs_show_options, }; module_init(init_squashfs_fs); module_exit(exit_squashfs_fs); MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>"); MODULE_LICENSE("GPL");
1 1 1 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/ufs/super.c * * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics */ /* Derived from * * linux/fs/ext2/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ /* * Inspired by * * linux/fs/ufs/super.c * * Copyright (C) 1996 * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) * Laboratory for Computer Science Research Computing Facility * Rutgers, The State University of New Jersey * * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) * * Kernel module support added on 96/04/26 by * Stefan Reinauer <stepan@home.culture.mipt.ru> * * Module usage counts added on 96/04/29 by * Gertjan van Wingerde <gwingerde@gmail.com> * * Clean swab support on 19970406 by * Francois-Rene Rideau <fare@tunes.org> * * 4.4BSD (FreeBSD) support added on February 1st 1998 by * Niels Kristian Bech Jensen <nkbj@image.dk> partially based * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>. * * NeXTstep support added on February 5th 1998 by * Niels Kristian Bech Jensen <nkbj@image.dk>. * * write support Daniel Pirkl <daniel.pirkl@email.cz> 1998 * * HP/UX hfs filesystem support added by * Martin K. Petersen <mkp@mkp.net>, August 1999 * * UFS2 (of FreeBSD 5.x) support added by * Niraj Kumar <niraj17@iitbombay.org>, Jan 2004 * * UFS2 write support added by * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ #include <linux/exportfs.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/stdarg.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/log2.h> #include <linux/seq_file.h> #include <linux/iversion.h> #include "ufs_fs.h" #include "ufs.h" #include "swab.h" #include "util.h" static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct inode *inode; if (ino < UFS_ROOTINO || ino > (u64)uspi->s_ncg * uspi->s_ipg) return ERR_PTR(-ESTALE); inode = ufs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); } static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); } static struct dentry *ufs_get_parent(struct dentry *child) { ino_t ino; ino = ufs_inode_by_name(d_inode(child), &dotdot_name); if (!ino) return ERR_PTR(-ENOENT); return d_obtain_alias(ufs_iget(child->d_sb, ino)); } static const struct export_operations ufs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ufs_fh_to_dentry, .fh_to_parent = ufs_fh_to_parent, .get_parent = ufs_get_parent, }; #ifdef CONFIG_UFS_DEBUG /* * Print contents of ufs_super_block, useful for debugging */ static void ufs_print_super_stuff(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_second *usb2, struct ufs_super_block_third *usb3) { u32 magic = fs32_to_cpu(sb, usb3->fs_magic); pr_debug("ufs_print_super_stuff\n"); pr_debug(" magic: 0x%x\n", magic); if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { pr_debug(" fs_size: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); pr_debug(" fs_dsize: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); pr_debug(" fs_sblockloc: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); pr_debug(" cs_nbfree(No of free blocks): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); pr_info(" cs_nifree(Num of free inodes): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); pr_info(" cs_nffree(Num of free frags): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); pr_info(" fs_maxsymlinklen: %u\n", fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); } else { pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); pr_debug(" cgoffset: %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset)); pr_debug(" ~cgmask: 0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask)); pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); pr_debug(" fragshift: %u\n", fs32_to_cpu(sb, usb1->fs_fragshift)); pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); pr_debug(" fstodb: %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb)); pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); pr_debug(" ndir %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); pr_debug(" nifree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); pr_debug(" nbfree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); pr_debug(" nffree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); } pr_debug("\n"); } /* * Print contents of ufs_cylinder_group, useful for debugging */ static void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) { pr_debug("\nufs_print_cylinder_stuff\n"); pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); pr_debug(" clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); pr_debug(" clusteroff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); pr_debug(" nclusterblks %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); pr_debug("\n"); } #else # define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ # define ufs_print_cylinder_stuff(sb, cg) /**/ #endif /* CONFIG_UFS_DEBUG */ static const struct super_operations ufs_super_ops; void ufs_error (struct super_block * sb, const char * function, const char * fmt, ...) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); if (!sb_rdonly(sb)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); sb->s_flags |= SB_RDONLY; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; switch (UFS_SB(sb)->s_on_err) { case UFS_MOUNT_ONERROR_PANIC: panic("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); case UFS_MOUNT_ONERROR_LOCK: case UFS_MOUNT_ONERROR_UMOUNT: case UFS_MOUNT_ONERROR_REPAIR: pr_crit("error (device %s): %s: %pV\n", sb->s_id, function, &vaf); } va_end(args); } void ufs_panic (struct super_block * sb, const char * function, const char * fmt, ...) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); if (!sb_rdonly(sb)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; sb->s_flags |= SB_RDONLY; pr_crit("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); } void ufs_warning (struct super_block * sb, const char * function, const char * fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn("(device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); } enum { Opt_type, Opt_onerror }; static const struct constant_table ufs_param_ufstype[] = { {"old", UFS_MOUNT_UFSTYPE_OLD}, {"sunx86", UFS_MOUNT_UFSTYPE_SUNx86}, {"sun", UFS_MOUNT_UFSTYPE_SUN}, {"sunos", UFS_MOUNT_UFSTYPE_SUNOS}, {"44bsd", UFS_MOUNT_UFSTYPE_44BSD}, {"ufs2", UFS_MOUNT_UFSTYPE_UFS2}, {"5xbsd", UFS_MOUNT_UFSTYPE_UFS2}, {"hp", UFS_MOUNT_UFSTYPE_HP}, {"nextstep-cd", UFS_MOUNT_UFSTYPE_NEXTSTEP_CD}, {"nextstep", UFS_MOUNT_UFSTYPE_NEXTSTEP}, {"openstep", UFS_MOUNT_UFSTYPE_OPENSTEP}, {} }; static const struct constant_table ufs_param_onerror[] = { {"panic", UFS_MOUNT_ONERROR_PANIC}, {"lock", UFS_MOUNT_ONERROR_LOCK}, {"umount", UFS_MOUNT_ONERROR_UMOUNT}, {"repair", UFS_MOUNT_ONERROR_REPAIR}, {} }; static const struct fs_parameter_spec ufs_param_spec[] = { fsparam_enum ("ufstype", Opt_type, ufs_param_ufstype), fsparam_enum ("onerror", Opt_onerror, ufs_param_onerror), {} }; struct ufs_fs_context { unsigned int flavour, on_err; }; static int ufs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ufs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; UFSD("ENTER\n"); opt = fs_parse(fc, ufs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_type: if (ctx->flavour == result.uint_32) /* no-op */ return 0; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { pr_err("ufstype can't be changed during remount\n"); return -EINVAL; } if (ctx->flavour) { pr_err("conflicting ufstype options\n"); return -EINVAL; } ctx->flavour = result.uint_32; break; case Opt_onerror: ctx->on_err = result.uint_32; break; default: return -EINVAL; } return 0; } /* * Different types of UFS hold fs_cstotal in different * places, and use different data structure for it. * To make things simpler we just copy fs_cstotal to ufs_sb_private_info */ static void ufs_setup_cstotal(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; unsigned mtype = sbi->s_flavour; UFSD("ENTER, mtype=%u\n", mtype); usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) || mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree); uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree); uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree); } else { uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir); uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree); uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); } UFSD("EXIT\n"); } /* * Read on-disk structures associated with cylinder groups */ static int ufs_read_cylinder_structures(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; unsigned char * base, * space; unsigned size, blks, i; UFSD("ENTER\n"); /* * Read cs structures from (usually) first data block * on the device. */ size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; base = space = kmalloc(size, GFP_NOFS); if (!base) goto failed; sbi->s_csp = (struct ufs_csum *)space; for (i = 0; i < blks; i++) { struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i); if (!bh) goto failed; memcpy(space, bh->b_data, uspi->s_fsize); space += uspi->s_fsize; brelse (bh); } /* * Read cylinder group (we read only first fragment from block * at this time) and prepare internal data structures for cg caching. */ sbi->s_ucg = kmalloc_array(uspi->s_ncg, sizeof(struct buffer_head *), GFP_NOFS); if (!sbi->s_ucg) goto failed; for (i = 0; i < uspi->s_ncg; i++) sbi->s_ucg[i] = NULL; for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { sbi->s_ucpi[i] = NULL; sbi->s_cgno[i] = UFS_CGNO_EMPTY; } for (i = 0; i < uspi->s_ncg; i++) { UFSD("read cg %u\n", i); if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) goto failed; if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) goto failed; ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); } for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) goto failed; sbi->s_cgno[i] = UFS_CGNO_EMPTY; } sbi->s_cg_loaded = 0; UFSD("EXIT\n"); return 1; failed: kfree (base); if (sbi->s_ucg) { for (i = 0; i < uspi->s_ncg; i++) if (sbi->s_ucg[i]) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) kfree (sbi->s_ucpi[i]); } UFSD("EXIT (FAILED)\n"); return 0; } /* * Sync our internal copy of fs_cstotal with disk */ static void ufs_put_cstotal(struct super_block *sb) { unsigned mtype = UFS_SB(sb)->s_flavour; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; UFSD("ENTER\n"); usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if (mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); usb2->fs_un.fs_u2.cs_nbfree = cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); usb3->fs_un1.fs_u2.cs_nifree = cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); goto out; } if (mtype == UFS_MOUNT_UFSTYPE_44BSD && (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) { /* store stats in both old and new places */ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); usb2->fs_un.fs_u2.cs_nbfree = cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); usb3->fs_un1.fs_u2.cs_nifree = cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); } usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir); usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree); usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree); out: ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_print_super_stuff(sb, usb1, usb2, usb3); UFSD("EXIT\n"); } /** * ufs_put_super_internal() - put on-disk intrenal structures * @sb: pointer to super_block structure * Put on-disk structures associated with cylinder groups * and write them back to disk, also update cs_total on disk */ static void ufs_put_super_internal(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; unsigned char * base, * space; unsigned blks, size, i; UFSD("ENTER\n"); ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; base = space = (char*) sbi->s_csp; for (i = 0; i < blks; i++, space += uspi->s_fsize) { struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i); if (unlikely(!bh)) { // better than an oops... ufs_panic(sb, __func__, "can't write part of cylinder group summary"); continue; } memcpy(bh->b_data, space, uspi->s_fsize); mark_buffer_dirty(bh); brelse(bh); } for (i = 0; i < sbi->s_cg_loaded; i++) { ufs_put_cylinder (sb, i); kfree (sbi->s_ucpi[i]); } for (; i < UFS_MAX_GROUP_LOADED; i++) kfree (sbi->s_ucpi[i]); for (i = 0; i < uspi->s_ncg; i++) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); kfree (base); UFSD("EXIT\n"); } static int ufs_sync_fs(struct super_block *sb, int wait) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; unsigned flags; mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ufs_put_cstotal(sb); UFSD("EXIT\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } static void delayed_sync_fs(struct work_struct *work) { struct ufs_sb_info *sbi; sbi = container_of(work, struct ufs_sb_info, sync_work.work); spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); ufs_sync_fs(sbi->sb, 1); } void ufs_mark_sb_dirty(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); unsigned long delay; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->sync_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } static void ufs_put_super(struct super_block *sb) { struct ufs_sb_info * sbi = UFS_SB(sb); UFSD("ENTER\n"); if (!sb_rdonly(sb)) ufs_put_super_internal(sb); cancel_delayed_work_sync(&sbi->sync_work); ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); return; } static u64 ufs_max_bytes(struct super_block *sb) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; int bits = uspi->s_apbshift; u64 res; if (bits > 21) res = ~0ULL; else res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + (1LL << (3*bits)); if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) return MAX_LFS_FILESIZE; return res << uspi->s_bshift; } static int ufs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ufs_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_second * usb2; struct ufs_super_block_third * usb3; struct ufs_buffer_head * ubh; struct inode *inode; unsigned block_size, super_block_size; unsigned flags; unsigned super_block_offset; unsigned maxsymlen; int ret = -EINVAL; uspi = NULL; ubh = NULL; flags = 0; UFSD("ENTER\n"); #ifndef CONFIG_UFS_FS_WRITE if (!sb_rdonly(sb)) { pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); return -EROFS; } #endif sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) goto failed_nomem; sb->s_fs_info = sbi; sbi->sb = sb; UFSD("flag %u\n", (int)(sb_rdonly(sb))); mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); sbi->s_flavour = ctx->flavour; sbi->s_on_err = ctx->on_err; if (!sbi->s_flavour) { if (!silent) pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " "default is ufstype=old\n"); sbi->s_flavour = UFS_MOUNT_UFSTYPE_OLD; } uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); sbi->s_uspi = uspi; if (!uspi) goto failed; uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = NSEC_PER_SEC; sb->s_time_min = S32_MIN; sb->s_time_max = S32_MAX; switch (sbi->s_flavour) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); uspi->s_fsize = block_size = 512; uspi->s_fmask = ~(512 - 1); uspi->s_fshift = 9; uspi->s_sbsize = super_block_size = 1536; uspi->s_sbbase = 0; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; break; case UFS_MOUNT_UFSTYPE_UFS2: UFSD("ufstype=ufs2\n"); super_block_offset=SBLOCK_UFS2; uspi->s_fsize = block_size = 512; uspi->s_fmask = ~(512 - 1); uspi->s_fshift = 9; uspi->s_sbsize = super_block_size = 1536; uspi->s_sbbase = 0; sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; break; case UFS_MOUNT_UFSTYPE_SUN: UFSD("ufstype=sun\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_maxsymlinklen = 0; /* Not supported on disk */ flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUN | UFS_CG_SUN; break; case UFS_MOUNT_UFSTYPE_SUNOS: UFSD("ufstype=sunos\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = 2048; super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_maxsymlinklen = 0; /* Not supported on disk */ flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_SUNOS | UFS_CG_SUN; break; case UFS_MOUNT_UFSTYPE_SUNx86: UFSD("ufstype=sunx86\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_maxsymlinklen = 0; /* Not supported on disk */ flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUNx86 | UFS_CG_SUN; break; case UFS_MOUNT_UFSTYPE_OLD: UFSD("ufstype=old\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); sb->s_flags |= SB_RDONLY; } break; case UFS_MOUNT_UFSTYPE_NEXTSTEP: UFSD("ufstype=nextstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); sb->s_flags |= SB_RDONLY; } break; case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: UFSD("ufstype=nextstep-cd\n"); uspi->s_fsize = block_size = 2048; uspi->s_fmask = ~(2048 - 1); uspi->s_fshift = 11; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); sb->s_flags |= SB_RDONLY; } break; case UFS_MOUNT_UFSTYPE_OPENSTEP: UFSD("ufstype=openstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); sb->s_flags |= SB_RDONLY; } break; case UFS_MOUNT_UFSTYPE_HP: UFSD("ufstype=hp\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); sb->s_flags |= SB_RDONLY; } break; default: if (!silent) pr_err("unknown ufstype\n"); goto failed; } again: if (!sb_set_blocksize(sb, block_size)) { pr_err("failed to set blocksize\n"); goto failed; } /* * read ufs super block from device */ ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size); if (!ubh) goto failed; usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); /* Sort out mod used on SunOS 4.1.3 for fs_state */ uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); if (((flags & UFS_ST_MASK) == UFS_ST_SUNOS) && (uspi->s_postblformat != UFS_42POSTBLFMT)) { flags &= ~UFS_ST_MASK; flags |= UFS_ST_SUN; } if ((flags & UFS_ST_MASK) == UFS_ST_44BSD && uspi->s_postblformat == UFS_42POSTBLFMT) { if (!silent) pr_err("this is not a 44bsd filesystem"); goto failed; } /* * Check ufs magic number */ sbi->s_bytesex = BYTESEX_LE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: case UFS_MAGIC_4GB: goto magic_found; } sbi->s_bytesex = BYTESEX_BE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: case UFS_MAGIC_4GB: goto magic_found; } if ((sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP || sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD || sbi->s_flavour == UFS_MOUNT_UFSTYPE_OPENSTEP) && uspi->s_sbbase < 256) { ubh_brelse_uspi(uspi); ubh = NULL; uspi->s_sbbase += 8; goto again; } if (!silent) pr_err("%s(): bad magic number\n", __func__); goto failed; magic_found: /* * Check block and fragment sizes */ uspi->s_bsize = fs32_to_cpu(sb, usb1->fs_bsize); uspi->s_fsize = fs32_to_cpu(sb, usb1->fs_fsize); uspi->s_sbsize = fs32_to_cpu(sb, usb1->fs_sbsize); uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); if (!is_power_of_2(uspi->s_fsize)) { pr_err("%s(): fragment size %u is not a power of 2\n", __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize < 512) { pr_err("%s(): fragment size %u is too small\n", __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize > 4096) { pr_err("%s(): fragment size %u is too large\n", __func__, uspi->s_fsize); goto failed; } if (!is_power_of_2(uspi->s_bsize)) { pr_err("%s(): block size %u is not a power of 2\n", __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize < 4096) { pr_err("%s(): block size %u is too small\n", __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize / uspi->s_fsize > 8) { pr_err("%s(): too many fragments per block (%u)\n", __func__, uspi->s_bsize / uspi->s_fsize); goto failed; } if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { ubh_brelse_uspi(uspi); ubh = NULL; block_size = uspi->s_fsize; super_block_size = uspi->s_sbsize; UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size); goto again; } sbi->s_flags = flags;/*after that line some functions use s_flags*/ ufs_print_super_stuff(sb, usb1, usb2, usb3); /* * Check, if file system was correctly unmounted. * If not, make it read only. */ if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) || ((flags & UFS_ST_MASK) == UFS_ST_OLD) || (((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) && (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { switch(usb1->fs_clean) { case UFS_FSCLEAN: UFSD("fs is clean\n"); break; case UFS_FSSTABLE: UFSD("fs is stable\n"); break; case UFS_FSLOG: UFSD("fs is logging fs\n"); break; case UFS_FSOSF1: UFSD("fs is DEC OSF/1\n"); break; case UFS_FSACTIVE: pr_err("%s(): fs is active\n", __func__); sb->s_flags |= SB_RDONLY; break; case UFS_FSBAD: pr_err("%s(): fs is bad\n", __func__); sb->s_flags |= SB_RDONLY; break; default: pr_err("%s(): can't grok fs_clean 0x%x\n", __func__, usb1->fs_clean); sb->s_flags |= SB_RDONLY; break; } } else { pr_err("%s(): fs needs fsck\n", __func__); sb->s_flags |= SB_RDONLY; } /* * Read ufs_super_block into internal data structures */ sb->s_op = &ufs_super_ops; sb->s_export_op = &ufs_export_ops; sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); uspi->s_cblkno = fs32_to_cpu(sb, usb1->fs_cblkno); uspi->s_iblkno = fs32_to_cpu(sb, usb1->fs_iblkno); uspi->s_dblkno = fs32_to_cpu(sb, usb1->fs_dblkno); uspi->s_cgoffset = fs32_to_cpu(sb, usb1->fs_cgoffset); uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); } else { uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); } uspi->s_ncg = fs32_to_cpu(sb, usb1->fs_ncg); /* s_bsize already set */ /* s_fsize already set */ uspi->s_fpb = fs32_to_cpu(sb, usb1->fs_frag); uspi->s_minfree = fs32_to_cpu(sb, usb1->fs_minfree); uspi->s_bmask = fs32_to_cpu(sb, usb1->fs_bmask); uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, uspi->s_fshift); uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); /* s_sbsize already set */ uspi->s_csmask = fs32_to_cpu(sb, usb1->fs_csmask); uspi->s_csshift = fs32_to_cpu(sb, usb1->fs_csshift); uspi->s_nindir = fs32_to_cpu(sb, usb1->fs_nindir); uspi->s_inopb = fs32_to_cpu(sb, usb1->fs_inopb); uspi->s_nspf = fs32_to_cpu(sb, usb1->fs_nspf); uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3); uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave); uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew); if (uspi->fs_magic == UFS2_MAGIC) uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr); else uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize); uspi->s_cgsize = fs32_to_cpu(sb, usb1->fs_cgsize); uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak); uspi->s_nsect = fs32_to_cpu(sb, usb1->fs_nsect); uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc); uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize); uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); uspi->s_nrpos = fs32_to_cpu(sb, usb3->fs_nrpos); uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, uspi->s_minfree, 100); if (uspi->s_minfree <= 5) { uspi->s_time_to_space = ~0ULL; uspi->s_space_to_time = 0; usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); } else { uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1; uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize, uspi->s_minfree - 2, 100) - 1; } /* * Compute another frequently used values */ uspi->s_fpbmask = uspi->s_fpb - 1; if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) uspi->s_apbshift = uspi->s_bshift - 3; else uspi->s_apbshift = uspi->s_bshift - 2; uspi->s_apb = 1 << uspi->s_apbshift; uspi->s_apbmask = uspi->s_apb - 1; uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; uspi->s_inopf = uspi->s_inopb >> uspi->s_fpbshift; uspi->s_bpf = uspi->s_fsize << 3; uspi->s_bpfshift = uspi->s_fshift + 3; uspi->s_bpfmask = uspi->s_bpf - 1; if (sbi->s_flavour == UFS_MOUNT_UFSTYPE_44BSD || sbi->s_flavour == UFS_MOUNT_UFSTYPE_UFS2) uspi->s_maxsymlinklen = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); if (uspi->fs_magic == UFS2_MAGIC) maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR); else maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR); if (uspi->s_maxsymlinklen > maxsymlen) { ufs_warning(sb, __func__, "ufs_read_super: excessive maximum " "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto failed; } sb->s_root = d_make_root(inode); if (!sb->s_root) { ret = -ENOMEM; goto failed; } ufs_setup_cstotal(sb); /* * Read cylinder group structures */ if (!sb_rdonly(sb)) if (!ufs_read_cylinder_structures(sb)) goto failed; UFSD("EXIT\n"); return 0; failed: if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); kfree(sbi); sb->s_fs_info = NULL; UFSD("EXIT (FAILED)\n"); return ret; failed_nomem: UFSD("EXIT (NOMEM)\n"); return -ENOMEM; } static int ufs_reconfigure(struct fs_context *fc) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; struct ufs_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; unsigned int ufstype; unsigned int flags; sync_filesystem(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); ufstype = UFS_SB(sb)->s_flavour; if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_on_err = ctx->on_err; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } /* * fs was mouted as rw, remounting ro */ if (fc->sb_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); sb->s_flags |= SB_RDONLY; } else { /* * fs was mounted as ro, remounting rw */ #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && ufstype != UFS_MOUNT_UFSTYPE_SUNOS && ufstype != UFS_MOUNT_UFSTYPE_44BSD && ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } sb->s_flags &= ~SB_RDONLY; #endif } UFS_SB(sb)->s_on_err = ctx->on_err; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } static int ufs_show_options(struct seq_file *seq, struct dentry *root) { struct ufs_sb_info *sbi = UFS_SB(root->d_sb); unsigned mval = sbi->s_flavour; const struct constant_table *tp; tp = ufs_param_ufstype; while (tp->value && tp->value != mval) ++tp; seq_printf(seq, ",ufstype=%s", tp->name); tp = ufs_param_onerror; mval = sbi->s_on_err; while (tp->value && tp->value != mval) ++tp; seq_printf(seq, ",onerror=%s", tp->name); return 0; } static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); mutex_lock(&UFS_SB(sb)->s_lock); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) buf->f_type = UFS2_MAGIC; else buf->f_type = UFS_MAGIC; buf->f_blocks = uspi->s_dsize; buf->f_bfree = ufs_freefrags(uspi); buf->f_ffree = uspi->cs_total.cs_nifree; buf->f_bsize = sb->s_blocksize; buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks) ? (buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; buf->f_fsid = u64_to_fsid(id); mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } static struct kmem_cache * ufs_inode_cachep; static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; ei = alloc_inode_sb(sb, ufs_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); seqlock_init(&ei->meta_lock); mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; } static void ufs_free_in_core_inode(struct inode *inode) { kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { ufs_inode_cachep = kmem_cache_create_usercopy("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), offsetof(struct ufs_inode_info, i_u1.i_symlink), sizeof_field(struct ufs_inode_info, i_u1.i_symlink), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ufs_inode_cachep); } static const struct super_operations ufs_super_ops = { .alloc_inode = ufs_alloc_inode, .free_inode = ufs_free_in_core_inode, .write_inode = ufs_write_inode, .evict_inode = ufs_evict_inode, .put_super = ufs_put_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .show_options = ufs_show_options, }; static int ufs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ufs_fill_super); } static void ufs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations ufs_context_ops = { .parse_param = ufs_parse_param, .get_tree = ufs_get_tree, .reconfigure = ufs_reconfigure, .free = ufs_free_fc, }; static int ufs_init_fs_context(struct fs_context *fc) { struct ufs_fs_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; struct ufs_sb_info *sbi = UFS_SB(sb); ctx->flavour = sbi->s_flavour; ctx->on_err = sbi->s_on_err; } else { ctx->flavour = 0; ctx->on_err = UFS_MOUNT_ONERROR_LOCK; } fc->fs_private = ctx; fc->ops = &ufs_context_ops; return 0; } static struct file_system_type ufs_fs_type = { .owner = THIS_MODULE, .name = "ufs", .kill_sb = kill_block_super, .init_fs_context = ufs_init_fs_context, .parameters = ufs_param_spec, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ufs"); static int __init init_ufs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&ufs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_ufs_fs(void) { unregister_filesystem(&ufs_fs_type); destroy_inodecache(); } module_init(init_ufs_fs) module_exit(exit_ufs_fs) MODULE_DESCRIPTION("UFS Filesystem"); MODULE_LICENSE("GPL");
18 18 61 61 60 62 61 60 60 18 18 18 62 43 43 43 43 43 43 43 43 42 43 43 43 43 43 43 43 62 18 18 18 18 44 42 62 61 61 44 43 43 43 18 44 43 43 62 59 62 62 43 43 43 43 18 18 18 18 18 18 18 18 18 23 23 22 23 5 18 23 5 5 18 19 18 18 18 18 31 31 43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 // SPDX-License-Identifier: GPL-2.0-or-later /* * Scatterlist Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #include <linux/err.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/param.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/completion.h> #include "internal.h" LIST_HEAD(crypto_alg_list); EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); #if IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) && IS_ENABLED(CONFIG_CRYPTO_SELFTESTS) DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); #endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg, u32 type, u32 mask); static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type, u32 mask); struct crypto_alg *crypto_mod_get(struct crypto_alg *alg) { return try_module_get(alg->cra_module) ? crypto_alg_get(alg) : NULL; } EXPORT_SYMBOL_GPL(crypto_mod_get); void crypto_mod_put(struct crypto_alg *alg) { struct module *module = alg->cra_module; crypto_alg_put(alg); module_put(module); } EXPORT_SYMBOL_GPL(crypto_mod_put); static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *q, *alg = NULL; int best = -2; list_for_each_entry(q, &crypto_alg_list, cra_list) { int exact, fuzzy; if (crypto_is_moribund(q)) continue; if ((q->cra_flags ^ type) & mask) continue; exact = !strcmp(q->cra_driver_name, name); fuzzy = !strcmp(q->cra_name, name); if (!exact && !(fuzzy && q->cra_priority > best)) continue; if (unlikely(!crypto_mod_get(q))) continue; best = q->cra_priority; if (alg) crypto_mod_put(alg); alg = q; if (exact) break; } return alg; } static void crypto_larval_destroy(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; BUG_ON(!crypto_is_larval(alg)); if (!IS_ERR_OR_NULL(larval->adult)) crypto_mod_put(larval->adult); kfree(larval); } struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask) { struct crypto_larval *larval; larval = kzalloc(sizeof(*larval), GFP_KERNEL); if (!larval) return ERR_PTR(-ENOMEM); type &= ~CRYPTO_ALG_TYPE_MASK | (mask ?: CRYPTO_ALG_TYPE_MASK); larval->mask = mask; larval->alg.cra_flags = CRYPTO_ALG_LARVAL | type; larval->alg.cra_priority = -1; larval->alg.cra_destroy = crypto_larval_destroy; strscpy(larval->alg.cra_name, name, CRYPTO_MAX_ALG_NAME); init_completion(&larval->completion); return larval; } EXPORT_SYMBOL_GPL(crypto_larval_alloc); static struct crypto_alg *crypto_larval_add(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_larval *larval; larval = crypto_larval_alloc(name, type, mask); if (IS_ERR(larval)) return ERR_CAST(larval); refcount_set(&larval->alg.cra_refcnt, 2); down_write(&crypto_alg_sem); alg = __crypto_alg_lookup(name, type, mask); if (!alg) { alg = &larval->alg; list_add(&alg->cra_list, &crypto_alg_list); } up_write(&crypto_alg_sem); if (alg != &larval->alg) { kfree(larval); if (crypto_is_larval(alg)) alg = crypto_larval_wait(alg, type, mask); } return alg; } static void crypto_larval_kill(struct crypto_larval *larval) { bool unlinked; down_write(&crypto_alg_sem); unlinked = list_empty(&larval->alg.cra_list); if (!unlinked) list_del_init(&larval->alg.cra_list); up_write(&crypto_alg_sem); if (unlinked) return; complete_all(&larval->completion); crypto_alg_put(&larval->alg); } void crypto_schedule_test(struct crypto_larval *larval) { int err; err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult); WARN_ON_ONCE(err != NOTIFY_STOP); } EXPORT_SYMBOL_GPL(crypto_schedule_test); static void crypto_start_test(struct crypto_larval *larval) { if (!crypto_is_test_larval(larval)) return; if (larval->test_started) return; down_write(&crypto_alg_sem); if (larval->test_started) { up_write(&crypto_alg_sem); return; } larval->test_started = true; up_write(&crypto_alg_sem); crypto_schedule_test(larval); } static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg, u32 type, u32 mask) { struct crypto_larval *larval; long time_left; again: larval = container_of(alg, struct crypto_larval, alg); if (!crypto_boot_test_finished()) crypto_start_test(larval); time_left = wait_for_completion_killable_timeout( &larval->completion, 60 * HZ); alg = larval->adult; if (time_left < 0) alg = ERR_PTR(-EINTR); else if (!time_left) { if (crypto_is_test_larval(larval)) crypto_larval_kill(larval); alg = ERR_PTR(-ETIMEDOUT); } else if (!alg || PTR_ERR(alg) == -EEXIST) { int err = alg ? -EEXIST : -EAGAIN; /* * EEXIST is expected because two probes can be scheduled * at the same time with one using alg_name and the other * using driver_name. Do a re-lookup but do not retry in * case we hit a quirk like gcm_base(ctr(aes),...) which * will never match. */ alg = &larval->alg; alg = crypto_alg_lookup(alg->cra_name, type, mask) ?: ERR_PTR(err); } else if (IS_ERR(alg)) ; else if (crypto_is_test_larval(larval) && !(alg->cra_flags & CRYPTO_ALG_TESTED)) alg = ERR_PTR(-EAGAIN); else if (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL) alg = ERR_PTR(-EAGAIN); else if (!crypto_mod_get(alg)) alg = ERR_PTR(-EAGAIN); crypto_mod_put(&larval->alg); if (!IS_ERR(alg) && crypto_is_larval(alg)) goto again; return alg; } static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type, u32 mask) { const u32 fips = CRYPTO_ALG_FIPS_INTERNAL; struct crypto_alg *alg; u32 test = 0; if (!((type | mask) & CRYPTO_ALG_TESTED)) test |= CRYPTO_ALG_TESTED; down_read(&crypto_alg_sem); alg = __crypto_alg_lookup(name, (type | test) & ~fips, (mask | test) & ~fips); if (alg) { if (((type | mask) ^ fips) & fips) mask |= fips; mask &= fips; if (!crypto_is_larval(alg) && ((type ^ alg->cra_flags) & mask)) { /* Algorithm is disallowed in FIPS mode. */ crypto_mod_put(alg); alg = ERR_PTR(-ENOENT); } } else if (test) { alg = __crypto_alg_lookup(name, type, mask); if (alg && !crypto_is_larval(alg)) { /* Test failed */ crypto_mod_put(alg); alg = ERR_PTR(-ELIBBAD); } } up_read(&crypto_alg_sem); return alg; } static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; if (!name) return ERR_PTR(-ENOENT); type &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); alg = crypto_alg_lookup(name, type, mask); if (!alg && !(mask & CRYPTO_NOLOAD)) { request_module("crypto-%s", name); if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask & CRYPTO_ALG_NEED_FALLBACK)) request_module("crypto-%s-all", name); alg = crypto_alg_lookup(name, type, mask); } if (!IS_ERR_OR_NULL(alg) && crypto_is_larval(alg)) alg = crypto_larval_wait(alg, type, mask); else if (alg) ; else if (!(mask & CRYPTO_ALG_TESTED)) alg = crypto_larval_add(name, type, mask); else alg = ERR_PTR(-ENOENT); return alg; } int crypto_probing_notify(unsigned long val, void *v) { int ok; ok = blocking_notifier_call_chain(&crypto_chain, val, v); if (ok == NOTIFY_DONE) { request_module("cryptomgr"); ok = blocking_notifier_call_chain(&crypto_chain, val, v); } return ok; } EXPORT_SYMBOL_GPL(crypto_probing_notify); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_alg *larval; int ok; /* * If the internal flag is set for a cipher, require a caller to * invoke the cipher with the internal flag to use that cipher. * Also, if a caller wants to allocate a cipher that may or may * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and * !(mask & CRYPTO_ALG_INTERNAL). */ if (!((type | mask) & CRYPTO_ALG_INTERNAL)) mask |= CRYPTO_ALG_INTERNAL; larval = crypto_larval_lookup(name, type, mask); if (IS_ERR(larval) || !crypto_is_larval(larval)) return larval; ok = crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval); if (ok == NOTIFY_STOP) alg = crypto_larval_wait(larval, type, mask); else { crypto_mod_put(larval); alg = ERR_PTR(-ENOENT); } crypto_larval_kill(container_of(larval, struct crypto_larval, alg)); return alg; } EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup); static void crypto_exit_ops(struct crypto_tfm *tfm) { const struct crypto_type *type = tfm->__crt_alg->cra_type; if (type && tfm->exit) tfm->exit(tfm); } static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask) { const struct crypto_type *type_obj = alg->cra_type; unsigned int len; len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1); if (type_obj) return len + type_obj->ctxsize(alg, type, mask); switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) { default: BUG(); case CRYPTO_ALG_TYPE_CIPHER: len += crypto_cipher_ctxsize(alg); break; } return len; } void crypto_shoot_alg(struct crypto_alg *alg) { down_write(&crypto_alg_sem); alg->cra_flags |= CRYPTO_ALG_DYING; up_write(&crypto_alg_sem); } EXPORT_SYMBOL_GPL(crypto_shoot_alg); struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type, u32 mask, gfp_t gfp) { struct crypto_tfm *tfm; unsigned int tfm_size; int err = -ENOMEM; tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask); tfm = kzalloc(tfm_size, gfp); if (tfm == NULL) goto out_err; tfm->__crt_alg = alg; refcount_set(&tfm->refcnt, 1); if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; goto out; cra_init_failed: crypto_exit_ops(tfm); if (err == -EAGAIN) crypto_shoot_alg(alg); kfree(tfm); out_err: tfm = ERR_PTR(err); out: return tfm; } EXPORT_SYMBOL_GPL(__crypto_alloc_tfmgfp); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask) { return __crypto_alloc_tfmgfp(alg, type, mask, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__crypto_alloc_tfm); /* * crypto_alloc_base - Locate algorithm and allocate transform * @alg_name: Name of algorithm * @type: Type of algorithm * @mask: Mask for type comparison * * This function should not be used by new algorithm types. * Please use crypto_alloc_tfm instead. * * crypto_alloc_base() will first attempt to locate an already loaded * algorithm. If that fails and the kernel supports dynamically loadable * modules, it will then attempt to load a module of the same name or * alias. If that fails it will send a query to any loaded crypto manager * to construct an algorithm on the fly. A refcount is grabbed on the * algorithm which is then associated with the new transform. * * The returned transform is of a non-determinate type. Most people * should use one of the more specific allocation functions such as * crypto_alloc_skcipher(). * * In case of error the return value is an error pointer. */ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) { struct crypto_tfm *tfm; int err; for (;;) { struct crypto_alg *alg; alg = crypto_alg_mod_lookup(alg_name, type, mask); if (IS_ERR(alg)) { err = PTR_ERR(alg); goto err; } tfm = __crypto_alloc_tfm(alg, type, mask); if (!IS_ERR(tfm)) return tfm; crypto_mod_put(alg); err = PTR_ERR(tfm); err: if (err != -EAGAIN) break; if (fatal_signal_pending(current)) { err = -EINTR; break; } } return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_base); static void *crypto_alloc_tfmmem(struct crypto_alg *alg, const struct crypto_type *frontend, int node, gfp_t gfp) { struct crypto_tfm *tfm; unsigned int tfmsize; unsigned int total; char *mem; tfmsize = frontend->tfmsize; total = tfmsize + sizeof(*tfm) + frontend->extsize(alg); mem = kzalloc_node(total, gfp, node); if (mem == NULL) return ERR_PTR(-ENOMEM); tfm = (struct crypto_tfm *)(mem + tfmsize); tfm->__crt_alg = alg; tfm->node = node; refcount_set(&tfm->refcnt, 1); return mem; } void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node) { struct crypto_tfm *tfm; char *mem; int err; mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL); if (IS_ERR(mem)) goto out; tfm = (struct crypto_tfm *)(mem + frontend->tfmsize); tfm->fb = tfm; err = frontend->init_tfm(tfm); if (err) goto out_free_tfm; if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; goto out; cra_init_failed: crypto_exit_ops(tfm); out_free_tfm: if (err == -EAGAIN) crypto_shoot_alg(alg); kfree(mem); mem = ERR_PTR(err); out: return mem; } EXPORT_SYMBOL_GPL(crypto_create_tfm_node); void *crypto_clone_tfm(const struct crypto_type *frontend, struct crypto_tfm *otfm) { struct crypto_alg *alg = otfm->__crt_alg; struct crypto_tfm *tfm; char *mem; mem = ERR_PTR(-ESTALE); if (unlikely(!crypto_mod_get(alg))) goto out; mem = crypto_alloc_tfmmem(alg, frontend, otfm->node, GFP_ATOMIC); if (IS_ERR(mem)) { crypto_mod_put(alg); goto out; } tfm = (struct crypto_tfm *)(mem + frontend->tfmsize); tfm->crt_flags = otfm->crt_flags; tfm->fb = tfm; out: return mem; } EXPORT_SYMBOL_GPL(crypto_clone_tfm); struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { if (frontend) { type &= frontend->maskclear; mask &= frontend->maskclear; type |= frontend->type; mask |= frontend->maskset; } return crypto_alg_mod_lookup(alg_name, type, mask); } EXPORT_SYMBOL_GPL(crypto_find_alg); /* * crypto_alloc_tfm_node - Locate algorithm and allocate transform * @alg_name: Name of algorithm * @frontend: Frontend algorithm type * @type: Type of algorithm * @mask: Mask for type comparison * @node: NUMA node in which users desire to put requests, if node is * NUMA_NO_NODE, it means users have no special requirement. * * crypto_alloc_tfm() will first attempt to locate an already loaded * algorithm. If that fails and the kernel supports dynamically loadable * modules, it will then attempt to load a module of the same name or * alias. If that fails it will send a query to any loaded crypto manager * to construct an algorithm on the fly. A refcount is grabbed on the * algorithm which is then associated with the new transform. * * The returned transform is of a non-determinate type. Most people * should use one of the more specific allocation functions such as * crypto_alloc_skcipher(). * * In case of error the return value is an error pointer. */ void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node) { void *tfm; int err; for (;;) { struct crypto_alg *alg; alg = crypto_find_alg(alg_name, frontend, type, mask); if (IS_ERR(alg)) { err = PTR_ERR(alg); goto err; } tfm = crypto_create_tfm_node(alg, frontend, node); if (!IS_ERR(tfm)) return tfm; crypto_mod_put(alg); err = PTR_ERR(tfm); err: if (err != -EAGAIN) break; if (fatal_signal_pending(current)) { err = -EINTR; break; } } return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_tfm_node); /* * crypto_destroy_tfm - Free crypto transform * @mem: Start of tfm slab * @tfm: Transform to free * * This function frees up the transform and any associated resources, * then drops the refcount on the associated algorithm. */ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) { struct crypto_alg *alg; if (IS_ERR_OR_NULL(mem)) return; if (!refcount_dec_and_test(&tfm->refcnt)) return; alg = tfm->__crt_alg; if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); crypto_mod_put(alg); kfree_sensitive(mem); } EXPORT_SYMBOL_GPL(crypto_destroy_tfm); int crypto_has_alg(const char *name, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_has_alg); void crypto_req_done(void *data, int err) { struct crypto_wait *wait = data; if (err == -EINPROGRESS) return; wait->err = err; complete(&wait->completion); } EXPORT_SYMBOL_GPL(crypto_req_done); void crypto_destroy_alg(struct crypto_alg *alg) { if (alg->cra_type && alg->cra_type->destroy) alg->cra_type->destroy(alg); if (alg->cra_destroy) alg->cra_destroy(alg); } EXPORT_SYMBOL_GPL(crypto_destroy_alg); struct crypto_async_request *crypto_request_clone( struct crypto_async_request *req, size_t total, gfp_t gfp) { struct crypto_tfm *tfm = req->tfm; struct crypto_async_request *nreq; nreq = kmemdup(req, total, gfp); if (!nreq) { req->tfm = tfm->fb; return req; } nreq->flags &= ~CRYPTO_TFM_REQ_ON_STACK; return nreq; } EXPORT_SYMBOL_GPL(crypto_request_clone); MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL");
1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> /* For the statistics structure. */ #include <linux/slab.h> #include <linux/uaccess.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> /* * Only allow IP over NET/ROM frames through if the netrom device is up. */ int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; if (!netif_running(dev)) { stats->rx_dropped++; return 0; } stats->rx_packets++; stats->rx_bytes += skb->len; skb->protocol = htons(ETH_P_IP); /* Spoof incoming device */ skb->dev = dev; skb->mac_header = skb->network_header; skb_reset_network_header(skb); skb->pkt_type = PACKET_HOST; netif_rx(skb); return 1; } static int nr_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); memcpy(buff, (saddr != NULL) ? saddr : dev->dev_addr, dev->addr_len); buff[6] &= ~AX25_CBIT; buff[6] &= ~AX25_EBIT; buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; if (daddr != NULL) memcpy(buff, daddr, dev->addr_len); buff[6] &= ~AX25_CBIT; buff[6] |= AX25_EBIT; buff[6] |= AX25_SSSID_SPARE; buff += AX25_ADDR_LEN; *buff++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); *buff++ = NR_PROTO_IP; *buff++ = NR_PROTO_IP; *buff++ = 0; *buff++ = 0; *buff++ = NR_PROTOEXT; if (daddr != NULL) return 37; return -37; } static int __must_check nr_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = ax25_listen_register((ax25_address *)sa->sa_data, NULL); if (err) return err; ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); } dev_addr_set(dev, sa->sa_data); return 0; } static int nr_open(struct net_device *dev) { int err; err = ax25_listen_register((const ax25_address *)dev->dev_addr, NULL); if (err) return err; netif_start_queue(dev); return 0; } static int nr_close(struct net_device *dev) { ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); netif_stop_queue(dev); return 0; } static netdev_tx_t nr_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!nr_route_frame(skb, NULL)) { kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops nr_header_ops = { .create = nr_header, }; static const struct net_device_ops nr_netdev_ops = { .ndo_open = nr_open, .ndo_stop = nr_close, .ndo_start_xmit = nr_xmit, .ndo_set_mac_address = nr_set_mac_address, }; void nr_setup(struct net_device *dev) { dev->mtu = NR_MAX_PACKET_SIZE; dev->netdev_ops = &nr_netdev_ops; dev->header_ops = &nr_header_ops; dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_NETROM; /* New-style flags. */ dev->flags = IFF_NOARP; }
6 3 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
2 2 2 2 2 3 3 3 3 2 2 5 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 /* SPDX-License-Identifier: GPL-2.0-only */ /* * User-mode machine state access * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Red Hat Author: Roland McGrath. */ #ifndef _LINUX_REGSET_H #define _LINUX_REGSET_H 1 #include <linux/compiler.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/uaccess.h> struct task_struct; struct user_regset; struct membuf { void *p; size_t left; }; static inline int membuf_zero(struct membuf *s, size_t size) { if (s->left) { if (size > s->left) size = s->left; memset(s->p, 0, size); s->p += size; s->left -= size; } return s->left; } static inline int membuf_write(struct membuf *s, const void *v, size_t size) { if (s->left) { if (size > s->left) size = s->left; memcpy(s->p, v, size); s->p += size; s->left -= size; } return s->left; } static inline struct membuf membuf_at(const struct membuf *s, size_t offs) { struct membuf n = *s; if (offs > n.left) offs = n.left; n.p += offs; n.left -= offs; return n; } /* current s->p must be aligned for v; v must be a scalar */ #define membuf_store(s, v) \ ({ \ struct membuf *__s = (s); \ if (__s->left) { \ typeof(v) __v = (v); \ size_t __size = sizeof(__v); \ if (unlikely(__size > __s->left)) { \ __size = __s->left; \ memcpy(__s->p, &__v, __size); \ } else { \ *(typeof(__v + 0) *)__s->p = __v; \ } \ __s->p += __size; \ __s->left -= __size; \ } \ __s->left;}) /** * user_regset_active_fn - type of @active function in &struct user_regset * @target: thread being examined * @regset: regset being examined * * Return -%ENODEV if not available on the hardware found. * Return %0 if no interesting state in this thread. * Return >%0 number of @size units of interesting state. * Any get call fetching state beyond that number will * see the default initialization state for this data, * so a caller that knows what the default state is need * not copy it all out. * This call is optional; the pointer is %NULL if there * is no inexpensive check to yield a value < @n. */ typedef int user_regset_active_fn(struct task_struct *target, const struct user_regset *regset); typedef int user_regset_get2_fn(struct task_struct *target, const struct user_regset *regset, struct membuf to); /** * user_regset_set_fn - type of @set function in &struct user_regset * @target: thread being examined * @regset: regset being examined * @pos: offset into the regset data to access, in bytes * @count: amount of data to copy, in bytes * @kbuf: if not %NULL, a kernel-space pointer to copy from * @ubuf: if @kbuf is %NULL, a user-space pointer to copy from * * Store register values. Return %0 on success; -%EIO or -%ENODEV * are usual failure returns. The @pos and @count values are in * bytes, but must be properly aligned. If @kbuf is non-null, that * buffer is used and @ubuf is ignored. If @kbuf is %NULL, then * ubuf gives a userland pointer to access directly, and an -%EFAULT * return value is possible. */ typedef int user_regset_set_fn(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf); /** * user_regset_writeback_fn - type of @writeback function in &struct user_regset * @target: thread being examined * @regset: regset being examined * @immediate: zero if writeback at completion of next context switch is OK * * This call is optional; usually the pointer is %NULL. When * provided, there is some user memory associated with this regset's * hardware, such as memory backing cached register data on register * window machines; the regset's data controls what user memory is * used (e.g. via the stack pointer value). * * Write register data back to user memory. If the @immediate flag * is nonzero, it must be written to the user memory so uaccess or * access_process_vm() can see it when this call returns; if zero, * then it must be written back by the time the task completes a * context switch (as synchronized with wait_task_inactive()). * Return %0 on success or if there was nothing to do, -%EFAULT for * a memory problem (bad stack pointer or whatever), or -%EIO for a * hardware problem. */ typedef int user_regset_writeback_fn(struct task_struct *target, const struct user_regset *regset, int immediate); /** * struct user_regset - accessible thread CPU state * @n: Number of slots (registers). * @size: Size in bytes of a slot (register). * @align: Required alignment, in bytes. * @bias: Bias from natural indexing. * @core_note_type: ELF note @n_type value used in core dumps. * @core_note_name: ELF note name to qualify the note type. * @regset_get: Function to fetch values. * @set: Function to store values. * @active: Function to report if regset is active, or %NULL. * @writeback: Function to write data back to user memory, or %NULL. * * This data structure describes a machine resource we call a register set. * This is part of the state of an individual thread, not necessarily * actual CPU registers per se. A register set consists of a number of * similar slots, given by @n. Each slot is @size bytes, and aligned to * @align bytes (which is at least @size). For dynamically-sized * regsets, @n must contain the maximum possible number of slots for the * regset. * * For backward compatibility, the @get and @set methods must pad to, or * accept, @n * @size bytes, even if the current regset size is smaller. * The precise semantics of these operations depend on the regset being * accessed. * * The functions to which &struct user_regset members point must be * called only on the current thread or on a thread that is in * %TASK_STOPPED or %TASK_TRACED state, that we are guaranteed will not * be woken up and return to user mode, and that we have called * wait_task_inactive() on. (The target thread always might wake up for * SIGKILL while these functions are working, in which case that * thread's user_regset state might be scrambled.) * * The @pos argument must be aligned according to @align; the @count * argument must be a multiple of @size. These functions are not * responsible for checking for invalid arguments. * * When there is a natural value to use as an index, @bias gives the * difference between the natural index and the slot index for the * register set. For example, x86 GDT segment descriptors form a regset; * the segment selector produces a natural index, but only a subset of * that index space is available as a regset (the TLS slots); subtracting * @bias from a segment selector index value computes the regset slot. * * If nonzero, @core_note_type gives the n_type field (NT_* value) * of the core file note in which this regset's data appears. * @core_note_name specifies the note name. The preferred way to * specify these two fields is to use the @USER_REGSET_NOTE_TYPE() * macro. * * NT_PRSTATUS is a special case in that the regset data starts at * offsetof(struct elf_prstatus, pr_reg) into the note data; that is * part of the per-machine ELF formats userland knows about. In * other cases, the core file note contains exactly the whole regset * (@n * @size) and nothing else. The core file note is normally * omitted when there is an @active function and it returns zero. */ struct user_regset { user_regset_get2_fn *regset_get; user_regset_set_fn *set; user_regset_active_fn *active; user_regset_writeback_fn *writeback; unsigned int n; unsigned int size; unsigned int align; unsigned int bias; unsigned int core_note_type; const char *core_note_name; }; #define USER_REGSET_NOTE_TYPE(type) \ .core_note_type = (NT_ ## type), \ .core_note_name = (NN_ ## type) /** * struct user_regset_view - available regsets * @name: Identifier, e.g. UTS_MACHINE string. * @regsets: Array of @n regsets available in this view. * @n: Number of elements in @regsets. * @e_machine: ELF header @e_machine %EM_* value written in core dumps. * @e_flags: ELF header @e_flags value written in core dumps. * @ei_osabi: ELF header @e_ident[%EI_OSABI] value written in core dumps. * * A regset view is a collection of regsets (&struct user_regset, * above). This describes all the state of a thread that can be seen * from a given architecture/ABI environment. More than one view might * refer to the same &struct user_regset, or more than one regset * might refer to the same machine-specific state in the thread. For * example, a 32-bit thread's state could be examined from the 32-bit * view or from the 64-bit view. Either method reaches the same thread * register state, doing appropriate widening or truncation. */ struct user_regset_view { const char *name; const struct user_regset *regsets; unsigned int n; u32 e_flags; u16 e_machine; u8 ei_osabi; }; /* * This is documented here rather than at the definition sites because its * implementation is machine-dependent but its interface is universal. */ /** * task_user_regset_view - Return the process's native regset view. * @tsk: a thread of the process in question * * Return the &struct user_regset_view that is native for the given process. * For example, what it would access when it called ptrace(). * Throughout the life of the process, this only changes at exec. */ const struct user_regset_view *task_user_regset_view(struct task_struct *tsk); static inline int user_regset_copyin(unsigned int *pos, unsigned int *count, const void **kbuf, const void __user **ubuf, void *data, const int start_pos, const int end_pos) { if (*count == 0) return 0; BUG_ON(*pos < start_pos); if (end_pos < 0 || *pos < end_pos) { unsigned int copy = (end_pos < 0 ? *count : min(*count, end_pos - *pos)); data += *pos - start_pos; if (*kbuf) { memcpy(data, *kbuf, copy); *kbuf += copy; } else if (__copy_from_user(data, *ubuf, copy)) return -EFAULT; else *ubuf += copy; *pos += copy; *count -= copy; } return 0; } static inline void user_regset_copyin_ignore(unsigned int *pos, unsigned int *count, const void **kbuf, const void __user **ubuf, const int start_pos, const int end_pos) { if (*count == 0) return; BUG_ON(*pos < start_pos); if (end_pos < 0 || *pos < end_pos) { unsigned int copy = (end_pos < 0 ? *count : min(*count, end_pos - *pos)); if (*kbuf) *kbuf += copy; else *ubuf += copy; *pos += copy; *count -= copy; } } extern int regset_get(struct task_struct *target, const struct user_regset *regset, unsigned int size, void *data); extern int regset_get_alloc(struct task_struct *target, const struct user_regset *regset, unsigned int size, void **data); extern int copy_regset_to_user(struct task_struct *target, const struct user_regset_view *view, unsigned int setno, unsigned int offset, unsigned int size, void __user *data); /** * copy_regset_from_user - store into thread's user_regset data from user memory * @target: thread to be examined * @view: &struct user_regset_view describing user thread machine state * @setno: index in @view->regsets * @offset: offset into the regset data, in bytes * @size: amount of data to copy, in bytes * @data: user-mode pointer to copy from */ static inline int copy_regset_from_user(struct task_struct *target, const struct user_regset_view *view, unsigned int setno, unsigned int offset, unsigned int size, const void __user *data) { const struct user_regset *regset = &view->regsets[setno]; if (!regset->set) return -EOPNOTSUPP; if (!access_ok(data, size)) return -EFAULT; return regset->set(target, regset, offset, size, NULL, data); } #endif /* <linux/regset.h> */
1060 1509 1481 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 /* SPDX-License-Identifier: GPL-2.0 */ /* * Common values and helper functions for the ChaCha and XChaCha stream ciphers. * * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's * security. Here they share the same key size, tfm context, and setkey * function; only their IV size and encrypt/decrypt function differ. * * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is * recommended to use the 20-round variant ChaCha20. However, the other * variants can be needed in some performance-sensitive scenarios. The generic * ChaCha code currently allows only the 20 and 12-round variants. */ #ifndef _CRYPTO_CHACHA_H #define _CRYPTO_CHACHA_H #include <linux/unaligned.h> #include <linux/string.h> #include <linux/types.h> /* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ #define CHACHA_IV_SIZE 16 #define CHACHA_KEY_SIZE 32 #define CHACHA_BLOCK_SIZE 64 #define CHACHAPOLY_IV_SIZE 12 #define CHACHA_KEY_WORDS 8 #define CHACHA_STATE_WORDS 16 #define HCHACHA_OUT_WORDS 8 /* 192-bit nonce, then 64-bit stream position */ #define XCHACHA_IV_SIZE 32 struct chacha_state { u32 x[CHACHA_STATE_WORDS]; }; void chacha_block_generic(struct chacha_state *state, u8 out[CHACHA_BLOCK_SIZE], int nrounds); static inline void chacha20_block(struct chacha_state *state, u8 out[CHACHA_BLOCK_SIZE]) { chacha_block_generic(state, out, 20); } void hchacha_block_generic(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); void hchacha_block(const struct chacha_state *state, u32 out[HCHACHA_OUT_WORDS], int nrounds); enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, CHACHA_CONSTANT_ND_3 = 0x3320646eU, CHACHA_CONSTANT_2_BY = 0x79622d32U, CHACHA_CONSTANT_TE_K = 0x6b206574U }; static inline void chacha_init_consts(struct chacha_state *state) { state->x[0] = CHACHA_CONSTANT_EXPA; state->x[1] = CHACHA_CONSTANT_ND_3; state->x[2] = CHACHA_CONSTANT_2_BY; state->x[3] = CHACHA_CONSTANT_TE_K; } static inline void chacha_init(struct chacha_state *state, const u32 key[CHACHA_KEY_WORDS], const u8 iv[CHACHA_IV_SIZE]) { chacha_init_consts(state); state->x[4] = key[0]; state->x[5] = key[1]; state->x[6] = key[2]; state->x[7] = key[3]; state->x[8] = key[4]; state->x[9] = key[5]; state->x[10] = key[6]; state->x[11] = key[7]; state->x[12] = get_unaligned_le32(iv + 0); state->x[13] = get_unaligned_le32(iv + 4); state->x[14] = get_unaligned_le32(iv + 8); state->x[15] = get_unaligned_le32(iv + 12); } void chacha_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); static inline void chacha20_crypt(struct chacha_state *state, u8 *dst, const u8 *src, unsigned int bytes) { chacha_crypt(state, dst, src, bytes, 20); } static inline void chacha_zeroize_state(struct chacha_state *state) { memzero_explicit(state, sizeof(*state)); } #endif /* _CRYPTO_CHACHA_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 // SPDX-License-Identifier: GPL-2.0 /* * support.c - standard functions for the use of pnp protocol drivers * * Copyright 2003 Adam Belay <ambx1@neo.rr.com> * Copyright (C) 2008 Hewlett-Packard Development Company, L.P. * Bjorn Helgaas <bjorn.helgaas@hp.com> */ #include <linux/module.h> #include <linux/ctype.h> #include <linux/pnp.h> #include "base.h" /** * pnp_is_active - Determines if a device is active based on its current * resources * @dev: pointer to the desired PnP device */ int pnp_is_active(struct pnp_dev *dev) { /* * I don't think this is very reliable because pnp_disable_dev() * only clears out auto-assigned resources. */ if (!pnp_port_start(dev, 0) && pnp_port_len(dev, 0) <= 1 && !pnp_mem_start(dev, 0) && pnp_mem_len(dev, 0) <= 1 && pnp_irq(dev, 0) == -1 && pnp_dma(dev, 0) == -1) return 0; else return 1; } EXPORT_SYMBOL(pnp_is_active); /* * Functionally similar to acpi_ex_eisa_id_to_string(), but that's * buried in the ACPI CA, and we can't depend on it being present. */ void pnp_eisa_id_to_string(u32 id, char *str) { id = be32_to_cpu(id); /* * According to the specs, the first three characters are five-bit * compressed ASCII, and the left-over high order bit should be zero. * However, the Linux ISAPNP code historically used six bits for the * first character, and there seem to be IDs that depend on that, * e.g., "nEC8241" in the Linux 8250_pnp serial driver and the * FreeBSD sys/pc98/cbus/sio_cbus.c driver. */ str[0] = 'A' + ((id >> 26) & 0x3f) - 1; str[1] = 'A' + ((id >> 21) & 0x1f) - 1; str[2] = 'A' + ((id >> 16) & 0x1f) - 1; str[3] = hex_asc_hi(id >> 8); str[4] = hex_asc_lo(id >> 8); str[5] = hex_asc_hi(id); str[6] = hex_asc_lo(id); str[7] = '\0'; } char *pnp_resource_type_name(struct resource *res) { switch (pnp_resource_type(res)) { case IORESOURCE_IO: return "io"; case IORESOURCE_MEM: return "mem"; case IORESOURCE_IRQ: return "irq"; case IORESOURCE_DMA: return "dma"; case IORESOURCE_BUS: return "bus"; } return "unknown"; } void dbg_pnp_show_resources(struct pnp_dev *dev, char *desc) { struct pnp_resource *pnp_res; if (list_empty(&dev->resources)) pnp_dbg(&dev->dev, "%s: no current resources\n", desc); else { pnp_dbg(&dev->dev, "%s: current resources:\n", desc); list_for_each_entry(pnp_res, &dev->resources, list) pnp_dbg(&dev->dev, "%pr\n", &pnp_res->res); } } char *pnp_option_priority_name(struct pnp_option *option) { switch (pnp_option_priority(option)) { case PNP_RES_PRIORITY_PREFERRED: return "preferred"; case PNP_RES_PRIORITY_ACCEPTABLE: return "acceptable"; case PNP_RES_PRIORITY_FUNCTIONAL: return "functional"; } return "invalid"; } void dbg_pnp_show_option(struct pnp_dev *dev, struct pnp_option *option) { char buf[128]; int len = 0, i; struct pnp_port *port; struct pnp_mem *mem; struct pnp_irq *irq; struct pnp_dma *dma; if (pnp_option_is_dependent(option)) len += scnprintf(buf + len, sizeof(buf) - len, " dependent set %d (%s) ", pnp_option_set(option), pnp_option_priority_name(option)); else len += scnprintf(buf + len, sizeof(buf) - len, " independent "); switch (option->type) { case IORESOURCE_IO: port = &option->u.port; len += scnprintf(buf + len, sizeof(buf) - len, "io min %#llx " "max %#llx align %lld size %lld flags %#x", (unsigned long long) port->min, (unsigned long long) port->max, (unsigned long long) port->align, (unsigned long long) port->size, port->flags); break; case IORESOURCE_MEM: mem = &option->u.mem; len += scnprintf(buf + len, sizeof(buf) - len, "mem min %#llx " "max %#llx align %lld size %lld flags %#x", (unsigned long long) mem->min, (unsigned long long) mem->max, (unsigned long long) mem->align, (unsigned long long) mem->size, mem->flags); break; case IORESOURCE_IRQ: irq = &option->u.irq; len += scnprintf(buf + len, sizeof(buf) - len, "irq"); if (bitmap_empty(irq->map.bits, PNP_IRQ_NR)) len += scnprintf(buf + len, sizeof(buf) - len, " <none>"); else { for (i = 0; i < PNP_IRQ_NR; i++) if (test_bit(i, irq->map.bits)) len += scnprintf(buf + len, sizeof(buf) - len, " %d", i); } len += scnprintf(buf + len, sizeof(buf) - len, " flags %#x", irq->flags); if (irq->flags & IORESOURCE_IRQ_OPTIONAL) len += scnprintf(buf + len, sizeof(buf) - len, " (optional)"); break; case IORESOURCE_DMA: dma = &option->u.dma; len += scnprintf(buf + len, sizeof(buf) - len, "dma"); if (!dma->map) len += scnprintf(buf + len, sizeof(buf) - len, " <none>"); else { for (i = 0; i < 8; i++) if (dma->map & (1 << i)) len += scnprintf(buf + len, sizeof(buf) - len, " %d", i); } len += scnprintf(buf + len, sizeof(buf) - len, " (bitmask %#x) " "flags %#x", dma->map, dma->flags); break; } pnp_dbg(&dev->dev, "%s\n", buf); }
12 12 12 12 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2009, Oracle. All rights reserved. * * Convert socket addresses to presentation addresses and universal * addresses, and vice versa. * * Universal addresses are introduced by RFC 1833 and further refined by * recent RFCs describing NFSv4. The universal address format is part * of the external (network) interface provided by rpcbind version 3 * and 4, and by NFSv4. Such an address is a string containing a * presentation format IP address followed by a port number in * "hibyte.lobyte" format. * * IPv6 addresses can also include a scope ID, typically denoted by * a '%' followed by a device name or a non-negative integer. Refer to * RFC 4291, Section 2.2 for details on IPv6 presentation formats. */ #include <net/ipv6.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/msg_prot.h> #include <linux/slab.h> #include <linux/export.h> #if IS_ENABLED(CONFIG_IPV6) static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, char *buf, const int buflen) { const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; const struct in6_addr *addr = &sin6->sin6_addr; /* * RFC 4291, Section 2.2.2 * * Shorthanded ANY address */ if (ipv6_addr_any(addr)) return snprintf(buf, buflen, "::"); /* * RFC 4291, Section 2.2.2 * * Shorthanded loopback address */ if (ipv6_addr_loopback(addr)) return snprintf(buf, buflen, "::1"); /* * RFC 4291, Section 2.2.3 * * Special presentation address format for mapped v4 * addresses. */ if (ipv6_addr_v4mapped(addr)) return snprintf(buf, buflen, "::ffff:%pI4", &addr->s6_addr32[3]); /* * RFC 4291, Section 2.2.1 */ return snprintf(buf, buflen, "%pI6c", addr); } static size_t rpc_ntop6(const struct sockaddr *sap, char *buf, const size_t buflen) { const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; char scopebuf[IPV6_SCOPE_ID_LEN]; size_t len; int rc; len = rpc_ntop6_noscopeid(sap, buf, buflen); if (unlikely(len == 0)) return len; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return len; if (sin6->sin6_scope_id == 0) return len; rc = snprintf(scopebuf, sizeof(scopebuf), "%c%u", IPV6_SCOPE_DELIMITER, sin6->sin6_scope_id); if (unlikely((size_t)rc >= sizeof(scopebuf))) return 0; len += rc; if (unlikely(len >= buflen)) return 0; strcat(buf, scopebuf); return len; } #else /* !IS_ENABLED(CONFIG_IPV6) */ static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, char *buf, const int buflen) { return 0; } static size_t rpc_ntop6(const struct sockaddr *sap, char *buf, const size_t buflen) { return 0; } #endif /* !IS_ENABLED(CONFIG_IPV6) */ static int rpc_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { const struct sockaddr_in *sin = (struct sockaddr_in *)sap; return snprintf(buf, buflen, "%pI4", &sin->sin_addr); } /** * rpc_ntop - construct a presentation address in @buf * @sap: socket address * @buf: construction area * @buflen: size of @buf, in bytes * * Plants a %NUL-terminated string in @buf and returns the length * of the string, excluding the %NUL. Otherwise zero is returned. */ size_t rpc_ntop(const struct sockaddr *sap, char *buf, const size_t buflen) { switch (sap->sa_family) { case AF_INET: return rpc_ntop4(sap, buf, buflen); case AF_INET6: return rpc_ntop6(sap, buf, buflen); } return 0; } EXPORT_SYMBOL_GPL(rpc_ntop); static size_t rpc_pton4(const char *buf, const size_t buflen, struct sockaddr *sap, const size_t salen) { struct sockaddr_in *sin = (struct sockaddr_in *)sap; u8 *addr = (u8 *)&sin->sin_addr.s_addr; if (buflen > INET_ADDRSTRLEN || salen < sizeof(struct sockaddr_in)) return 0; memset(sap, 0, sizeof(struct sockaddr_in)); if (in4_pton(buf, buflen, addr, '\0', NULL) == 0) return 0; sin->sin_family = AF_INET; return sizeof(struct sockaddr_in); } #if IS_ENABLED(CONFIG_IPV6) static int rpc_parse_scope_id(struct net *net, const char *buf, const size_t buflen, const char *delim, struct sockaddr_in6 *sin6) { char p[IPV6_SCOPE_ID_LEN + 1]; size_t len; u32 scope_id = 0; struct net_device *dev; if ((buf + buflen) == delim) return 1; if (*delim != IPV6_SCOPE_DELIMITER) return 0; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; len = (buf + buflen) - delim - 1; if (len > IPV6_SCOPE_ID_LEN) return 0; memcpy(p, delim + 1, len); p[len] = 0; dev = dev_get_by_name(net, p); if (dev != NULL) { scope_id = dev->ifindex; dev_put(dev); } else { if (kstrtou32(p, 10, &scope_id) != 0) return 0; } sin6->sin6_scope_id = scope_id; return 1; } static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen, struct sockaddr *sap, const size_t salen) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; const char *delim; if (buflen > (INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN) || salen < sizeof(struct sockaddr_in6)) return 0; memset(sap, 0, sizeof(struct sockaddr_in6)); if (in6_pton(buf, buflen, addr, IPV6_SCOPE_DELIMITER, &delim) == 0) return 0; if (!rpc_parse_scope_id(net, buf, buflen, delim, sin6)) return 0; sin6->sin6_family = AF_INET6; return sizeof(struct sockaddr_in6); } #else static size_t rpc_pton6(struct net *net, const char *buf, const size_t buflen, struct sockaddr *sap, const size_t salen) { return 0; } #endif /** * rpc_pton - Construct a sockaddr in @sap * @net: applicable network namespace * @buf: C string containing presentation format IP address * @buflen: length of presentation address in bytes * @sap: buffer into which to plant socket address * @salen: size of buffer in bytes * * Returns the size of the socket address if successful; otherwise * zero is returned. * * Plants a socket address in @sap and returns the size of the * socket address, if successful. Returns zero if an error * occurred. */ size_t rpc_pton(struct net *net, const char *buf, const size_t buflen, struct sockaddr *sap, const size_t salen) { unsigned int i; for (i = 0; i < buflen; i++) if (buf[i] == ':') return rpc_pton6(net, buf, buflen, sap, salen); return rpc_pton4(buf, buflen, sap, salen); } EXPORT_SYMBOL_GPL(rpc_pton); /** * rpc_sockaddr2uaddr - Construct a universal address string from @sap. * @sap: socket address * @gfp_flags: allocation mode * * Returns a %NUL-terminated string in dynamically allocated memory; * otherwise NULL is returned if an error occurred. Caller must * free the returned string. */ char *rpc_sockaddr2uaddr(const struct sockaddr *sap, gfp_t gfp_flags) { char portbuf[RPCBIND_MAXUADDRPLEN]; char addrbuf[RPCBIND_MAXUADDRLEN]; unsigned short port; switch (sap->sa_family) { case AF_INET: if (rpc_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) return NULL; port = ntohs(((struct sockaddr_in *)sap)->sin_port); break; case AF_INET6: if (rpc_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) return NULL; port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); break; default: return NULL; } if (snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff) >= (int)sizeof(portbuf)) return NULL; if (strlcat(addrbuf, portbuf, sizeof(addrbuf)) >= sizeof(addrbuf)) return NULL; return kstrdup(addrbuf, gfp_flags); } /** * rpc_uaddr2sockaddr - convert a universal address to a socket address. * @net: applicable network namespace * @uaddr: C string containing universal address to convert * @uaddr_len: length of universal address string * @sap: buffer into which to plant socket address * @salen: size of buffer * * @uaddr does not have to be '\0'-terminated, but kstrtou8() and * rpc_pton() require proper string termination to be successful. * * Returns the size of the socket address if successful; otherwise * zero is returned. */ size_t rpc_uaddr2sockaddr(struct net *net, const char *uaddr, const size_t uaddr_len, struct sockaddr *sap, const size_t salen) { char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')]; u8 portlo, porthi; unsigned short port; if (uaddr_len > RPCBIND_MAXUADDRLEN) return 0; memcpy(buf, uaddr, uaddr_len); buf[uaddr_len] = '\0'; c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; if (unlikely(kstrtou8(c + 1, 10, &portlo) != 0)) return 0; *c = '\0'; c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; if (unlikely(kstrtou8(c + 1, 10, &porthi) != 0)) return 0; port = (unsigned short)((porthi << 8) | portlo); *c = '\0'; if (rpc_pton(net, buf, strlen(buf), sap, salen) == 0) return 0; switch (sap->sa_family) { case AF_INET: ((struct sockaddr_in *)sap)->sin_port = htons(port); return sizeof(struct sockaddr_in); case AF_INET6: ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); return sizeof(struct sockaddr_in6); } return 0; } EXPORT_SYMBOL_GPL(rpc_uaddr2sockaddr);
45 45 45 44 45 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 // SPDX-License-Identifier: GPL-2.0+ #include <linux/kernel.h> #include <linux/minmax.h> #include <drm/drm_blend.h> #include <drm/drm_rect.h> #include <drm/drm_fixed.h> #include <kunit/visibility.h> #include "vkms_formats.h" /** * packed_pixels_offset() - Get the offset of the block containing the pixel at coordinates x/y * * @frame_info: Buffer metadata * @x: The x coordinate of the wanted pixel in the buffer * @y: The y coordinate of the wanted pixel in the buffer * @plane_index: The index of the plane to use * @offset: The returned offset inside the buffer of the block * @rem_x: The returned X coordinate of the requested pixel in the block * @rem_y: The returned Y coordinate of the requested pixel in the block * * As some pixel formats store multiple pixels in a block (DRM_FORMAT_R* for example), some * pixels are not individually addressable. This function return 3 values: the offset of the * whole block, and the coordinate of the requested pixel inside this block. * For example, if the format is DRM_FORMAT_R1 and the requested coordinate is 13,5, the offset * will point to the byte 5*pitches + 13/8 (second byte of the 5th line), and the rem_x/rem_y * coordinates will be (13 % 8, 5 % 1) = (5, 0) * * With this function, the caller just have to extract the correct pixel from the block. */ static void packed_pixels_offset(const struct vkms_frame_info *frame_info, int x, int y, int plane_index, int *offset, int *rem_x, int *rem_y) { struct drm_framebuffer *fb = frame_info->fb; const struct drm_format_info *format = frame_info->fb->format; /* Directly using x and y to multiply pitches and format->ccp is not sufficient because * in some formats a block can represent multiple pixels. * * Dividing x and y by the block size allows to extract the correct offset of the block * containing the pixel. */ int block_x = x / drm_format_info_block_width(format, plane_index); int block_y = y / drm_format_info_block_height(format, plane_index); int block_pitch = fb->pitches[plane_index] * drm_format_info_block_height(format, plane_index); *rem_x = x % drm_format_info_block_width(format, plane_index); *rem_y = y % drm_format_info_block_height(format, plane_index); *offset = fb->offsets[plane_index] + block_y * block_pitch + block_x * format->char_per_block[plane_index]; } /** * packed_pixels_addr() - Get the pointer to the block containing the pixel at the given * coordinates * * @frame_info: Buffer metadata * @x: The x (width) coordinate inside the plane * @y: The y (height) coordinate inside the plane * @plane_index: The index of the plane * @addr: The returned pointer * @rem_x: The returned X coordinate of the requested pixel in the block * @rem_y: The returned Y coordinate of the requested pixel in the block * * Takes the information stored in the frame_info, a pair of coordinates, and returns the address * of the block containing this pixel and the pixel position inside this block. * * See @packed_pixels_offset for details about rem_x/rem_y behavior. */ static void packed_pixels_addr(const struct vkms_frame_info *frame_info, int x, int y, int plane_index, u8 **addr, int *rem_x, int *rem_y) { int offset; packed_pixels_offset(frame_info, x, y, plane_index, &offset, rem_x, rem_y); *addr = (u8 *)frame_info->map[0].vaddr + offset; } /** * get_block_step_bytes() - Common helper to compute the correct step value between each pixel block * to read in a certain direction. * * @fb: Framebuffer to iter on * @direction: Direction of the reading * @plane_index: Plane to get the step from * * As the returned count is the number of bytes between two consecutive blocks in a direction, * the caller may have to read multiple pixels before using the next one (for example, to read from * left to right in a DRM_FORMAT_R1 plane, each block contains 8 pixels, so the step must be used * only every 8 pixels). */ static int get_block_step_bytes(struct drm_framebuffer *fb, enum pixel_read_direction direction, int plane_index) { switch (direction) { case READ_LEFT_TO_RIGHT: return fb->format->char_per_block[plane_index]; case READ_RIGHT_TO_LEFT: return -fb->format->char_per_block[plane_index]; case READ_TOP_TO_BOTTOM: return (int)fb->pitches[plane_index] * drm_format_info_block_width(fb->format, plane_index); case READ_BOTTOM_TO_TOP: return -(int)fb->pitches[plane_index] * drm_format_info_block_width(fb->format, plane_index); } return 0; } /** * packed_pixels_addr_1x1() - Get the pointer to the block containing the pixel at the given * coordinates * * @frame_info: Buffer metadata * @x: The x (width) coordinate inside the plane * @y: The y (height) coordinate inside the plane * @plane_index: The index of the plane * @addr: The returned pointer * * This function can only be used with format where block_h == block_w == 1. */ static void packed_pixels_addr_1x1(const struct vkms_frame_info *frame_info, int x, int y, int plane_index, u8 **addr) { int offset, rem_x, rem_y; WARN_ONCE(drm_format_info_block_width(frame_info->fb->format, plane_index) != 1, "%s() only support formats with block_w == 1", __func__); WARN_ONCE(drm_format_info_block_height(frame_info->fb->format, plane_index) != 1, "%s() only support formats with block_h == 1", __func__); packed_pixels_offset(frame_info, x, y, plane_index, &offset, &rem_x, &rem_y); *addr = (u8 *)frame_info->map[0].vaddr + offset; } /** * get_subsampling() - Get the subsampling divisor value on a specific direction * * @format: format to extarct the subsampling from * @direction: direction of the subsampling requested */ static int get_subsampling(const struct drm_format_info *format, enum pixel_read_direction direction) { switch (direction) { case READ_BOTTOM_TO_TOP: case READ_TOP_TO_BOTTOM: return format->vsub; case READ_RIGHT_TO_LEFT: case READ_LEFT_TO_RIGHT: return format->hsub; } WARN_ONCE(true, "Invalid direction for pixel reading: %d\n", direction); return 1; } /** * get_subsampling_offset() - An offset for keeping the chroma siting consistent regardless of * x_start and y_start values * * @direction: direction of the reading to properly compute this offset * @x_start: x coordinate of the starting point of the readed line * @y_start: y coordinate of the starting point of the readed line */ static int get_subsampling_offset(enum pixel_read_direction direction, int x_start, int y_start) { switch (direction) { case READ_BOTTOM_TO_TOP: return -y_start - 1; case READ_TOP_TO_BOTTOM: return y_start; case READ_RIGHT_TO_LEFT: return -x_start - 1; case READ_LEFT_TO_RIGHT: return x_start; } WARN_ONCE(true, "Invalid direction for pixel reading: %d\n", direction); return 0; } /* * The following functions take pixel data (a, r, g, b, pixel, ...) and convert them to * &struct pixel_argb_u16 * * They are used in the `read_line`s functions to avoid duplicate work for some pixel formats. */ static struct pixel_argb_u16 argb_u16_from_u8888(u8 a, u8 r, u8 g, u8 b) { struct pixel_argb_u16 out_pixel; /* * The 257 is the "conversion ratio". This number is obtained by the * (2^16 - 1) / (2^8 - 1) division. Which, in this case, tries to get * the best color value in a pixel format with more possibilities. * A similar idea applies to others RGB color conversions. */ out_pixel.a = (u16)a * 257; out_pixel.r = (u16)r * 257; out_pixel.g = (u16)g * 257; out_pixel.b = (u16)b * 257; return out_pixel; } static struct pixel_argb_u16 argb_u16_from_u16161616(u16 a, u16 r, u16 g, u16 b) { struct pixel_argb_u16 out_pixel; out_pixel.a = a; out_pixel.r = r; out_pixel.g = g; out_pixel.b = b; return out_pixel; } static struct pixel_argb_u16 argb_u16_from_le16161616(__le16 a, __le16 r, __le16 g, __le16 b) { return argb_u16_from_u16161616(le16_to_cpu(a), le16_to_cpu(r), le16_to_cpu(g), le16_to_cpu(b)); } static struct pixel_argb_u16 argb_u16_from_RGB565(const __le16 *pixel) { struct pixel_argb_u16 out_pixel; s64 fp_rb_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(31)); s64 fp_g_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(63)); u16 rgb_565 = le16_to_cpu(*pixel); s64 fp_r = drm_int2fixp((rgb_565 >> 11) & 0x1f); s64 fp_g = drm_int2fixp((rgb_565 >> 5) & 0x3f); s64 fp_b = drm_int2fixp(rgb_565 & 0x1f); out_pixel.a = (u16)0xffff; out_pixel.r = drm_fixp2int_round(drm_fixp_mul(fp_r, fp_rb_ratio)); out_pixel.g = drm_fixp2int_round(drm_fixp_mul(fp_g, fp_g_ratio)); out_pixel.b = drm_fixp2int_round(drm_fixp_mul(fp_b, fp_rb_ratio)); return out_pixel; } static struct pixel_argb_u16 argb_u16_from_gray8(u8 gray) { return argb_u16_from_u8888(255, gray, gray, gray); } static struct pixel_argb_u16 argb_u16_from_grayu16(u16 gray) { return argb_u16_from_u16161616(0xFFFF, gray, gray, gray); } static struct pixel_argb_u16 argb_u16_from_BGR565(const __le16 *pixel) { struct pixel_argb_u16 out_pixel; out_pixel = argb_u16_from_RGB565(pixel); swap(out_pixel.r, out_pixel.b); return out_pixel; } VISIBLE_IF_KUNIT struct pixel_argb_u16 argb_u16_from_yuv161616(const struct conversion_matrix *matrix, u16 y, u16 channel_1, u16 channel_2) { u16 r, g, b; s64 fp_y, fp_channel_1, fp_channel_2; s64 fp_r, fp_g, fp_b; fp_y = drm_int2fixp((int)y - matrix->y_offset * 257); fp_channel_1 = drm_int2fixp((int)channel_1 - 128 * 257); fp_channel_2 = drm_int2fixp((int)channel_2 - 128 * 257); fp_r = drm_fixp_mul(matrix->matrix[0][0], fp_y) + drm_fixp_mul(matrix->matrix[0][1], fp_channel_1) + drm_fixp_mul(matrix->matrix[0][2], fp_channel_2); fp_g = drm_fixp_mul(matrix->matrix[1][0], fp_y) + drm_fixp_mul(matrix->matrix[1][1], fp_channel_1) + drm_fixp_mul(matrix->matrix[1][2], fp_channel_2); fp_b = drm_fixp_mul(matrix->matrix[2][0], fp_y) + drm_fixp_mul(matrix->matrix[2][1], fp_channel_1) + drm_fixp_mul(matrix->matrix[2][2], fp_channel_2); fp_r = drm_fixp2int_round(fp_r); fp_g = drm_fixp2int_round(fp_g); fp_b = drm_fixp2int_round(fp_b); r = clamp(fp_r, 0, 0xffff); g = clamp(fp_g, 0, 0xffff); b = clamp(fp_b, 0, 0xffff); return argb_u16_from_u16161616(0xffff, r, g, b); } EXPORT_SYMBOL_IF_KUNIT(argb_u16_from_yuv161616); /** * READ_LINE() - Generic generator for a read_line function which can be used for format with one * plane and a block_h == block_w == 1. * * @function_name: Function name to generate * @pixel_name: Temporary pixel name used in the @__VA_ARGS__ parameters * @pixel_type: Used to specify the type you want to cast the pixel pointer * @callback: Callback to call for each pixels. This fonction should take @__VA_ARGS__ as parameter * and return a pixel_argb_u16 * __VA_ARGS__: Argument to pass inside the callback. You can use @pixel_name to access current * pixel. */ #define READ_LINE(function_name, pixel_name, pixel_type, callback, ...) \ static void function_name(const struct vkms_plane_state *plane, int x_start, \ int y_start, enum pixel_read_direction direction, int count, \ struct pixel_argb_u16 out_pixel[]) \ { \ struct pixel_argb_u16 *end = out_pixel + count; \ int step = get_block_step_bytes(plane->frame_info->fb, direction, 0); \ u8 *src_pixels; \ \ packed_pixels_addr_1x1(plane->frame_info, x_start, y_start, 0, &src_pixels); \ \ while (out_pixel < end) { \ pixel_type *(pixel_name) = (pixel_type *)src_pixels; \ *out_pixel = (callback)(__VA_ARGS__); \ out_pixel += 1; \ src_pixels += step; \ } \ } /** * READ_LINE_ARGB8888() - Generic generator for ARGB8888 formats. * The pixel type used is u8, so pixel_name[0]..pixel_name[n] are the n components of the pixel. * * @function_name: Function name to generate * @pixel_name: temporary pixel to use in @a, @r, @g and @b parameters * @a: alpha value * @r: red value * @g: green value * @b: blue value */ #define READ_LINE_ARGB8888(function_name, pixel_name, a, r, g, b) \ READ_LINE(function_name, pixel_name, u8, argb_u16_from_u8888, a, r, g, b) /** * READ_LINE_le16161616() - Generic generator for ARGB16161616 formats. * The pixel type used is u16, so pixel_name[0]..pixel_name[n] are the n components of the pixel. * * @function_name: Function name to generate * @pixel_name: temporary pixel to use in @a, @r, @g and @b parameters * @a: alpha value * @r: red value * @g: green value * @b: blue value */ #define READ_LINE_le16161616(function_name, pixel_name, a, r, g, b) \ READ_LINE(function_name, pixel_name, __le16, argb_u16_from_le16161616, a, r, g, b) /* * The following functions are read_line function for each pixel format supported by VKMS. * * They read a line starting at the point @x_start,@y_start following the @direction. The result * is stored in @out_pixel and in a 64 bits format, see struct pixel_argb_u16. * * These functions are very repetitive, but the innermost pixel loops must be kept inside these * functions for performance reasons. Some benchmarking was done in [1] where having the innermost * loop factored out of these functions showed a slowdown by a factor of three. * * [1]: https://lore.kernel.org/dri-devel/d258c8dc-78e9-4509-9037-a98f7f33b3a3@riseup.net/ */ static void Rx_read_line(const struct vkms_plane_state *plane, int x_start, int y_start, enum pixel_read_direction direction, int count, struct pixel_argb_u16 out_pixel[]) { struct pixel_argb_u16 *end = out_pixel + count; int bits_per_pixel = drm_format_info_bpp(plane->frame_info->fb->format, 0); u8 *src_pixels; int rem_x, rem_y; WARN_ONCE(drm_format_info_block_height(plane->frame_info->fb->format, 0) != 1, "%s() only support formats with block_h == 1", __func__); packed_pixels_addr(plane->frame_info, x_start, y_start, 0, &src_pixels, &rem_x, &rem_y); int bit_offset = (8 - bits_per_pixel) - rem_x * bits_per_pixel; int step = get_block_step_bytes(plane->frame_info->fb, direction, 0); int mask = (0x1 << bits_per_pixel) - 1; int lum_per_level = 0xFFFF / mask; if (direction == READ_LEFT_TO_RIGHT || direction == READ_RIGHT_TO_LEFT) { int restart_bit_offset; int step_bit_offset; if (direction == READ_LEFT_TO_RIGHT) { restart_bit_offset = 8 - bits_per_pixel; step_bit_offset = -bits_per_pixel; } else { restart_bit_offset = 0; step_bit_offset = bits_per_pixel; } while (out_pixel < end) { u8 val = ((*src_pixels) >> bit_offset) & mask; *out_pixel = argb_u16_from_grayu16((int)val * lum_per_level); bit_offset += step_bit_offset; if (bit_offset < 0 || 8 <= bit_offset) { bit_offset = restart_bit_offset; src_pixels += step; } out_pixel += 1; } } else if (direction == READ_TOP_TO_BOTTOM || direction == READ_BOTTOM_TO_TOP) { while (out_pixel < end) { u8 val = (*src_pixels >> bit_offset) & mask; *out_pixel = argb_u16_from_grayu16((int)val * lum_per_level); src_pixels += step; out_pixel += 1; } } } static void R1_read_line(const struct vkms_plane_state *plane, int x_start, int y_start, enum pixel_read_direction direction, int count, struct pixel_argb_u16 out_pixel[]) { Rx_read_line(plane, x_start, y_start, direction, count, out_pixel); } static void R2_read_line(const struct vkms_plane_state *plane, int x_start, int y_start, enum pixel_read_direction direction, int count, struct pixel_argb_u16 out_pixel[]) { Rx_read_line(plane, x_start, y_start, direction, count, out_pixel); } static void R4_read_line(const struct vkms_plane_state *plane, int x_start, int y_start, enum pixel_read_direction direction, int count, struct pixel_argb_u16 out_pixel[]) { Rx_read_line(plane, x_start, y_start, direction, count, out_pixel); } READ_LINE_ARGB8888(XRGB8888_read_line, px, 0xFF, px[2], px[1], px[0]) READ_LINE_ARGB8888(XBGR8888_read_line, px, 0xFF, px[0], px[1], px[2]) READ_LINE_ARGB8888(ARGB8888_read_line, px, px[3], px[2], px[1], px[0]) READ_LINE_ARGB8888(ABGR8888_read_line, px, px[3], px[0], px[1], px[2]) READ_LINE_ARGB8888(RGBA8888_read_line, px, px[0], px[3], px[2], px[1]) READ_LINE_ARGB8888(BGRA8888_read_line, px, px[0], px[1], px[2], px[3]) READ_LINE_ARGB8888(RGB888_read_line, px, 0xFF, px[2], px[1], px[0]) READ_LINE_ARGB8888(BGR888_read_line, px, 0xFF, px[0], px[1], px[2]) READ_LINE_le16161616(ARGB16161616_read_line, px, px[3], px[2], px[1], px[0]) READ_LINE_le16161616(ABGR16161616_read_line, px, px[3], px[0], px[1], px[2]) READ_LINE_le16161616(XRGB16161616_read_line, px, cpu_to_le16(0xFFFF), px[2], px[1], px[0]) READ_LINE_le16161616(XBGR16161616_read_line, px, cpu_to_le16(0xFFFF), px[0], px[1], px[2]) READ_LINE(RGB565_read_line, px, __le16, argb_u16_from_RGB565, px) READ_LINE(BGR565_read_line, px, __le16, argb_u16_from_BGR565, px) READ_LINE(R8_read_line, px, u8, argb_u16_from_gray8, *px) /* * This callback can be used for YUV formats where U and V values are * stored in the same plane (often called semi-planar formats). It will * correctly handle subsampling as described in the drm_format_info of the plane. * * The conversion matrix stored in the @plane is used to: * - Apply the correct color range and encoding * - Convert YUV and YVU with the same function (a column swap is needed when setting up * plane->conversion_matrix) */ /** * READ_LINE_YUV_SEMIPLANAR() - Generic generator for a read_line function which can be used for yuv * formats with two planes and block_w == block_h == 1. * * @function_name: Function name to generate * @pixel_1_name: temporary pixel name for the first plane used in the @__VA_ARGS__ parameters * @pixel_2_name: temporary pixel name for the second plane used in the @__VA_ARGS__ parameters * @pixel_1_type: Used to specify the type you want to cast the pixel pointer on the plane 1 * @pixel_2_type: Used to specify the type you want to cast the pixel pointer on the plane 2 * @callback: Callback to call for each pixels. This function should take * (struct conversion_matrix*, @__VA_ARGS__) as parameter and return a pixel_argb_u16 * __VA_ARGS__: Argument to pass inside the callback. You can use @pixel_1_name and @pixel_2_name * to access current pixel values */ #define READ_LINE_YUV_SEMIPLANAR(function_name, pixel_1_name, pixel_2_name, pixel_1_type, \ pixel_2_type, callback, ...) \ static void function_name(const struct vkms_plane_state *plane, int x_start, \ int y_start, enum pixel_read_direction direction, int count, \ struct pixel_argb_u16 out_pixel[]) \ { \ u8 *plane_1; \ u8 *plane_2; \ \ packed_pixels_addr_1x1(plane->frame_info, x_start, y_start, 0, \ &plane_1); \ packed_pixels_addr_1x1(plane->frame_info, \ x_start / plane->frame_info->fb->format->hsub, \ y_start / plane->frame_info->fb->format->vsub, 1, \ &plane_2); \ int step_1 = get_block_step_bytes(plane->frame_info->fb, direction, 0); \ int step_2 = get_block_step_bytes(plane->frame_info->fb, direction, 1); \ int subsampling = get_subsampling(plane->frame_info->fb->format, direction); \ int subsampling_offset = get_subsampling_offset(direction, x_start, y_start); \ const struct conversion_matrix *conversion_matrix = &plane->conversion_matrix; \ \ for (int i = 0; i < count; i++) { \ pixel_1_type *(pixel_1_name) = (pixel_1_type *)plane_1; \ pixel_2_type *(pixel_2_name) = (pixel_2_type *)plane_2; \ *out_pixel = (callback)(conversion_matrix, __VA_ARGS__); \ out_pixel += 1; \ plane_1 += step_1; \ if ((i + subsampling_offset + 1) % subsampling == 0) \ plane_2 += step_2; \ } \ } READ_LINE_YUV_SEMIPLANAR(YUV888_semiplanar_read_line, y, uv, u8, u8, argb_u16_from_yuv161616, y[0] * 257, uv[0] * 257, uv[1] * 257) READ_LINE_YUV_SEMIPLANAR(YUV161616_semiplanar_read_line, y, uv, u16, u16, argb_u16_from_yuv161616, y[0], uv[0], uv[1]) /* * This callback can be used for YUV format where each color component is * stored in a different plane (often called planar formats). It will * correctly handle subsampling as described in the drm_format_info of the plane. * * The conversion matrix stored in the @plane is used to: * - Apply the correct color range and encoding * - Convert YUV and YVU with the same function (a column swap is needed when setting up * plane->conversion_matrix) */ static void planar_yuv_read_line(const struct vkms_plane_state *plane, int x_start, int y_start, enum pixel_read_direction direction, int count, struct pixel_argb_u16 out_pixel[]) { u8 *y_plane; u8 *channel_1_plane; u8 *channel_2_plane; packed_pixels_addr_1x1(plane->frame_info, x_start, y_start, 0, &y_plane); packed_pixels_addr_1x1(plane->frame_info, x_start / plane->frame_info->fb->format->hsub, y_start / plane->frame_info->fb->format->vsub, 1, &channel_1_plane); packed_pixels_addr_1x1(plane->frame_info, x_start / plane->frame_info->fb->format->hsub, y_start / plane->frame_info->fb->format->vsub, 2, &channel_2_plane); int step_y = get_block_step_bytes(plane->frame_info->fb, direction, 0); int step_channel_1 = get_block_step_bytes(plane->frame_info->fb, direction, 1); int step_channel_2 = get_block_step_bytes(plane->frame_info->fb, direction, 2); int subsampling = get_subsampling(plane->frame_info->fb->format, direction); int subsampling_offset = get_subsampling_offset(direction, x_start, y_start); const struct conversion_matrix *conversion_matrix = &plane->conversion_matrix; for (int i = 0; i < count; i++) { *out_pixel = argb_u16_from_yuv161616(conversion_matrix, *y_plane * 257, *channel_1_plane * 257, *channel_2_plane * 257); out_pixel += 1; y_plane += step_y; if ((i + subsampling_offset + 1) % subsampling == 0) { channel_1_plane += step_channel_1; channel_2_plane += step_channel_2; } } } /* * The following functions take one &struct pixel_argb_u16 and convert it to a specific format. * The result is stored in @out_pixel. * * They are used in vkms_writeback_row() to convert and store a pixel from the src_buffer to * the writeback buffer. */ static void argb_u16_to_ARGB8888(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel) { /* * This sequence below is important because the format's byte order is * in little-endian. In the case of the ARGB8888 the memory is * organized this way: * * | Addr | = blue channel * | Addr + 1 | = green channel * | Addr + 2 | = Red channel * | Addr + 3 | = Alpha channel */ out_pixel[3] = DIV_ROUND_CLOSEST(in_pixel->a, 257); out_pixel[2] = DIV_ROUND_CLOSEST(in_pixel->r, 257); out_pixel[1] = DIV_ROUND_CLOSEST(in_pixel->g, 257); out_pixel[0] = DIV_ROUND_CLOSEST(in_pixel->b, 257); } static void argb_u16_to_XRGB8888(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel) { out_pixel[3] = 0xff; out_pixel[2] = DIV_ROUND_CLOSEST(in_pixel->r, 257); out_pixel[1] = DIV_ROUND_CLOSEST(in_pixel->g, 257); out_pixel[0] = DIV_ROUND_CLOSEST(in_pixel->b, 257); } static void argb_u16_to_ABGR8888(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel) { out_pixel[3] = DIV_ROUND_CLOSEST(in_pixel->a, 257); out_pixel[2] = DIV_ROUND_CLOSEST(in_pixel->b, 257); out_pixel[1] = DIV_ROUND_CLOSEST(in_pixel->g, 257); out_pixel[0] = DIV_ROUND_CLOSEST(in_pixel->r, 257); } static void argb_u16_to_ARGB16161616(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel) { __le16 *pixel = (__le16 *)out_pixel; pixel[3] = cpu_to_le16(in_pixel->a); pixel[2] = cpu_to_le16(in_pixel->r); pixel[1] = cpu_to_le16(in_pixel->g); pixel[0] = cpu_to_le16(in_pixel->b); } static void argb_u16_to_XRGB16161616(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel) { __le16 *pixel = (__le16 *)out_pixel; pixel[3] = cpu_to_le16(0xffff); pixel[2] = cpu_to_le16(in_pixel->r); pixel[1] = cpu_to_le16(in_pixel->g); pixel[0] = cpu_to_le16(in_pixel->b); } static void argb_u16_to_RGB565(u8 *out_pixel, const struct pixel_argb_u16 *in_pixel) { __le16 *pixel = (__le16 *)out_pixel; s64 fp_rb_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(31)); s64 fp_g_ratio = drm_fixp_div(drm_int2fixp(65535), drm_int2fixp(63)); s64 fp_r = drm_int2fixp(in_pixel->r); s64 fp_g = drm_int2fixp(in_pixel->g); s64 fp_b = drm_int2fixp(in_pixel->b); u16 r = drm_fixp2int(drm_fixp_div(fp_r, fp_rb_ratio)); u16 g = drm_fixp2int(drm_fixp_div(fp_g, fp_g_ratio)); u16 b = drm_fixp2int(drm_fixp_div(fp_b, fp_rb_ratio)); *pixel = cpu_to_le16(r << 11 | g << 5 | b); } /** * vkms_writeback_row() - Generic loop for all supported writeback format. It is executed just * after the blending to write a line in the writeback buffer. * * @wb: Job where to insert the final image * @src_buffer: Line to write * @y: Row to write in the writeback buffer */ void vkms_writeback_row(struct vkms_writeback_job *wb, const struct line_buffer *src_buffer, int y) { struct vkms_frame_info *frame_info = &wb->wb_frame_info; int x_dst = frame_info->dst.x1; u8 *dst_pixels; int rem_x, rem_y; packed_pixels_addr(frame_info, x_dst, y, 0, &dst_pixels, &rem_x, &rem_y); struct pixel_argb_u16 *in_pixels = src_buffer->pixels; int x_limit = min_t(size_t, drm_rect_width(&frame_info->dst), src_buffer->n_pixels); for (size_t x = 0; x < x_limit; x++, dst_pixels += frame_info->fb->format->cpp[0]) wb->pixel_write(dst_pixels, &in_pixels[x]); } /** * get_pixel_read_line_function() - Retrieve the correct read_line function for a specific * format. The returned pointer is NULL for unsupported pixel formats. The caller must ensure that * the pointer is valid before using it in a vkms_plane_state. * * @format: DRM_FORMAT_* value for which to obtain a conversion function (see [drm_fourcc.h]) */ pixel_read_line_t get_pixel_read_line_function(u32 format) { switch (format) { case DRM_FORMAT_ARGB8888: return &ARGB8888_read_line; case DRM_FORMAT_ABGR8888: return &ABGR8888_read_line; case DRM_FORMAT_BGRA8888: return &BGRA8888_read_line; case DRM_FORMAT_RGBA8888: return &RGBA8888_read_line; case DRM_FORMAT_XRGB8888: return &XRGB8888_read_line; case DRM_FORMAT_XBGR8888: return &XBGR8888_read_line; case DRM_FORMAT_RGB888: return &RGB888_read_line; case DRM_FORMAT_BGR888: return &BGR888_read_line; case DRM_FORMAT_ARGB16161616: return &ARGB16161616_read_line; case DRM_FORMAT_ABGR16161616: return &ABGR16161616_read_line; case DRM_FORMAT_XRGB16161616: return &XRGB16161616_read_line; case DRM_FORMAT_XBGR16161616: return &XBGR16161616_read_line; case DRM_FORMAT_RGB565: return &RGB565_read_line; case DRM_FORMAT_BGR565: return &BGR565_read_line; case DRM_FORMAT_NV12: case DRM_FORMAT_NV16: case DRM_FORMAT_NV24: case DRM_FORMAT_NV21: case DRM_FORMAT_NV61: case DRM_FORMAT_NV42: return &YUV888_semiplanar_read_line; case DRM_FORMAT_P010: case DRM_FORMAT_P012: case DRM_FORMAT_P016: return &YUV161616_semiplanar_read_line; case DRM_FORMAT_YUV420: case DRM_FORMAT_YUV422: case DRM_FORMAT_YUV444: case DRM_FORMAT_YVU420: case DRM_FORMAT_YVU422: case DRM_FORMAT_YVU444: return &planar_yuv_read_line; case DRM_FORMAT_R1: return &R1_read_line; case DRM_FORMAT_R2: return &R2_read_line; case DRM_FORMAT_R4: return &R4_read_line; case DRM_FORMAT_R8: return &R8_read_line; default: /* * This is a bug in vkms_plane_atomic_check(). All the supported * format must: * - Be listed in vkms_formats in vkms_plane.c * - Have a pixel_read callback defined here */ pr_err("Pixel format %p4cc is not supported by VKMS planes. This is a kernel bug, atomic check must forbid this configuration.\n", &format); BUG(); } } /* * Those matrices were generated using the colour python framework * * Below are the function calls used to generate each matrix, go to * https://colour.readthedocs.io/en/develop/generated/colour.matrix_YCbCr.html * for more info: * * numpy.around(colour.matrix_YCbCr(K=colour.WEIGHTS_YCBCR["ITU-R BT.601"], * is_legal = False, * bits = 8) * 2**32).astype(int) */ static const struct conversion_matrix no_operation = { .matrix = { { 4294967296, 0, 0, }, { 0, 4294967296, 0, }, { 0, 0, 4294967296, }, }, .y_offset = 0, }; static const struct conversion_matrix yuv_bt601_full = { .matrix = { { 4294967296, 0, 6021544149 }, { 4294967296, -1478054095, -3067191994 }, { 4294967296, 7610682049, 0 }, }, .y_offset = 0, }; /* * numpy.around(colour.matrix_YCbCr(K=colour.WEIGHTS_YCBCR["ITU-R BT.601"], * is_legal = True, * bits = 8) * 2**32).astype(int) */ static const struct conversion_matrix yuv_bt601_limited = { .matrix = { { 5020601039, 0, 6881764740 }, { 5020601039, -1689204679, -3505362278 }, { 5020601039, 8697922339, 0 }, }, .y_offset = 16, }; /* * numpy.around(colour.matrix_YCbCr(K=colour.WEIGHTS_YCBCR["ITU-R BT.709"], * is_legal = False, * bits = 8) * 2**32).astype(int) */ static const struct conversion_matrix yuv_bt709_full = { .matrix = { { 4294967296, 0, 6763714498 }, { 4294967296, -804551626, -2010578443 }, { 4294967296, 7969741314, 0 }, }, .y_offset = 0, }; /* * numpy.around(colour.matrix_YCbCr(K=colour.WEIGHTS_YCBCR["ITU-R BT.709"], * is_legal = True, * bits = 8) * 2**32).astype(int) */ static const struct conversion_matrix yuv_bt709_limited = { .matrix = { { 5020601039, 0, 7729959424 }, { 5020601039, -919487572, -2297803934 }, { 5020601039, 9108275786, 0 }, }, .y_offset = 16, }; /* * numpy.around(colour.matrix_YCbCr(K=colour.WEIGHTS_YCBCR["ITU-R BT.2020"], * is_legal = False, * bits = 8) * 2**32).astype(int) */ static const struct conversion_matrix yuv_bt2020_full = { .matrix = { { 4294967296, 0, 6333358775 }, { 4294967296, -706750298, -2453942994 }, { 4294967296, 8080551471, 0 }, }, .y_offset = 0, }; /* * numpy.around(colour.matrix_YCbCr(K=colour.WEIGHTS_YCBCR["ITU-R BT.2020"], * is_legal = True, * bits = 8) * 2**32).astype(int) */ static const struct conversion_matrix yuv_bt2020_limited = { .matrix = { { 5020601039, 0, 7238124312 }, { 5020601039, -807714626, -2804506279 }, { 5020601039, 9234915964, 0 }, }, .y_offset = 16, }; /** * swap_uv_columns() - Swap u and v column of a given matrix * * @matrix: Matrix in which column are swapped */ static void swap_uv_columns(struct conversion_matrix *matrix) { swap(matrix->matrix[0][2], matrix->matrix[0][1]); swap(matrix->matrix[1][2], matrix->matrix[1][1]); swap(matrix->matrix[2][2], matrix->matrix[2][1]); } /** * get_conversion_matrix_to_argb_u16() - Retrieve the correct yuv to rgb conversion matrix for a * given encoding and range. * * @format: DRM_FORMAT_* value for which to obtain a conversion function (see [drm_fourcc.h]) * @encoding: DRM_COLOR_* value for which to obtain a conversion matrix * @range: DRM_COLOR_*_RANGE value for which to obtain a conversion matrix * @matrix: Pointer to store the value into */ void get_conversion_matrix_to_argb_u16(u32 format, enum drm_color_encoding encoding, enum drm_color_range range, struct conversion_matrix *matrix) { const struct conversion_matrix *matrix_to_copy; bool limited_range; switch (range) { case DRM_COLOR_YCBCR_LIMITED_RANGE: limited_range = true; break; case DRM_COLOR_YCBCR_FULL_RANGE: limited_range = false; break; case DRM_COLOR_RANGE_MAX: limited_range = false; WARN_ONCE(true, "The requested range is not supported."); break; } switch (encoding) { case DRM_COLOR_YCBCR_BT601: matrix_to_copy = limited_range ? &yuv_bt601_limited : &yuv_bt601_full; break; case DRM_COLOR_YCBCR_BT709: matrix_to_copy = limited_range ? &yuv_bt709_limited : &yuv_bt709_full; break; case DRM_COLOR_YCBCR_BT2020: matrix_to_copy = limited_range ? &yuv_bt2020_limited : &yuv_bt2020_full; break; case DRM_COLOR_ENCODING_MAX: matrix_to_copy = &no_operation; WARN_ONCE(true, "The requested encoding is not supported."); break; } memcpy(matrix, matrix_to_copy, sizeof(*matrix_to_copy)); switch (format) { case DRM_FORMAT_YVU420: case DRM_FORMAT_YVU422: case DRM_FORMAT_YVU444: case DRM_FORMAT_NV21: case DRM_FORMAT_NV61: case DRM_FORMAT_NV42: swap_uv_columns(matrix); break; default: break; } } EXPORT_SYMBOL(get_conversion_matrix_to_argb_u16); /** * get_pixel_write_function() - Retrieve the correct write_pixel function for a specific format. * The returned pointer is NULL for unsupported pixel formats. The caller must ensure that the * pointer is valid before using it in a vkms_writeback_job. * * @format: DRM_FORMAT_* value for which to obtain a conversion function (see [drm_fourcc.h]) */ pixel_write_t get_pixel_write_function(u32 format) { switch (format) { case DRM_FORMAT_ARGB8888: return &argb_u16_to_ARGB8888; case DRM_FORMAT_XRGB8888: return &argb_u16_to_XRGB8888; case DRM_FORMAT_ABGR8888: return &argb_u16_to_ABGR8888; case DRM_FORMAT_ARGB16161616: return &argb_u16_to_ARGB16161616; case DRM_FORMAT_XRGB16161616: return &argb_u16_to_XRGB16161616; case DRM_FORMAT_RGB565: return &argb_u16_to_RGB565; default: /* * This is a bug in vkms_writeback_atomic_check. All the supported * format must: * - Be listed in vkms_wb_formats in vkms_writeback.c * - Have a pixel_write callback defined here */ pr_err("Pixel format %p4cc is not supported by VKMS writeback. This is a kernel bug, atomic check must forbid this configuration.\n", &format); BUG(); } }
44 69 49 49 49 23 23 22 22 22 22 23 23 23 23 27 19 27 1 1 125 125 123 123 123 1 123 1 123 123 123 116 31 30 31 31 120 123 22 22 22 5 2 3 2 1 1 1 1 5 4 1 3 2 4 5 5 2 4 1 1 6 1 1 58 56 3 2 2 2 4 2 2 4 2 4 4 4 3 3 3 3 33 22 22 22 22 22 22 22 22 22 22 22 22 33 11 3 9 6 11 5 29 4 28 9 27 1 10 1 1 26 29 29 29 31 30 28 29 29 29 29 29 29 29 27 12 5 5 19 19 19 19 12 14 14 14 14 14 14 1 1 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 2 8 2 8 6 3 2 7 6 5 4 2 3 2 2 2 7 2 2 2 1 1 2 2 7 7 5 5 7 7 3 2 3 3 2 3 3 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/jhash.h> #include <linux/delay.h> #include <linux/time.h> #include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/ethtool.h> #include <linux/wait.h> #include <asm/div64.h> #include <linux/highmem.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/inetdevice.h> #include <linux/list.h> #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> #include <net/genetlink.h> #include <net/gso.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/pkt_cls.h> #include "datapath.h" #include "drop.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" #include "meter.h" #include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" unsigned int ovs_net_id __read_mostly; static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; static struct genl_family dp_datapath_genl_family; static const struct nla_policy flow_policy[]; static const struct genl_multicast_group ovs_dp_flow_multicast_group = { .name = OVS_FLOW_MCGROUP, }; static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { .name = OVS_DATAPATH_MCGROUP, }; static const struct genl_multicast_group ovs_dp_vport_multicast_group = { .name = OVS_VPORT_MCGROUP, }; /* Check if need to build a reply message. * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, unsigned int group) { return info->nlhdr->nlmsg_flags & NLM_F_ECHO || genl_has_listeners(family, genl_info_net(info), group); } static void ovs_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info) { genl_notify(family, skb, info, 0, GFP_KERNEL); } /** * DOC: Locking: * * All writes e.g. Writes to device state (add/remove datapath, port, set * operations on vports, etc.), Writes to other state (flow table * modifications, set miscellaneous datapath parameters, etc.) are protected * by ovs_lock. * * Reads are protected by RCU. * * There are a few special cases (mostly stats) that have their own * synchronization but they nest under all of above and don't interact with * each other. * * The RTNL lock nests inside ovs_mutex. */ static DEFINE_MUTEX(ovs_mutex); void ovs_lock(void) { mutex_lock(&ovs_mutex); } void ovs_unlock(void) { mutex_unlock(&ovs_mutex); } #ifdef CONFIG_LOCKDEP int lockdep_ovsl_is_held(void) { if (debug_locks) return lockdep_is_held(&ovs_mutex); else return 1; } #endif static struct vport *new_vport(const struct vport_parms *); static int queue_gso_packets(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); static void ovs_dp_masks_rebalance(struct work_struct *work); static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) { struct vport *local; int ifindex; rcu_read_lock(); local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) ifindex = local->dev->ifindex; else ifindex = 0; rcu_read_unlock(); return ifindex; } static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } static struct hlist_head *vport_hash_bucket(const struct datapath *dp, u16 port_no) { return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; } /* Called with ovs_mutex or RCU read lock. */ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; struct hlist_head *head; head = vport_hash_bucket(dp, port_no); hlist_for_each_entry_rcu(vport, head, dp_hash_node, lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } return NULL; } /* Called with ovs_mutex. */ static struct vport *new_vport(const struct vport_parms *parms) { struct vport *vport; vport = ovs_vport_add(parms); if (!IS_ERR(vport)) { struct datapath *dp = parms->dp; struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); hlist_add_head_rcu(&vport->dp_hash_node, head); } return vport; } static void ovs_vport_update_upcall_stats(struct sk_buff *skb, const struct dp_upcall_info *upcall_info, bool upcall_result) { struct vport *p = OVS_CB(skb)->input_vport; struct vport_upcall_stats_percpu *stats; if (upcall_info->cmd != OVS_PACKET_CMD_MISS && upcall_info->cmd != OVS_PACKET_CMD_ACTION) return; stats = this_cpu_ptr(p->upcall_stats); u64_stats_update_begin(&stats->syncp); if (upcall_result) u64_stats_inc(&stats->n_success); else u64_stats_inc(&stats->n_fail); u64_stats_update_end(&stats->syncp); } void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); /* First drop references to device. */ hlist_del_rcu(&p->dp_hash_node); /* Then destroy it. */ ovs_vport_del(p); } /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; bool ovs_pcpu_locked = false; u64 *stats_counter; u32 n_mask_hit; u32 n_cache_hit; int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; if (OVS_CB(skb)->upcall_pid) upcall.portid = OVS_CB(skb)->upcall_pid; else if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); else upcall.portid = ovs_vport_find_upcall_portid(p, skb); upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); switch (error) { case 0: case -EAGAIN: case -ERESTARTSYS: case -EINTR: consume_skb(skb); break; default: kfree_skb(skb); break; } stats_counter = &stats->n_missed; goto out; } ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); /* This path can be invoked recursively: Use the current task to * identify recursive invocation - the lock must be acquired only once. * Even with disabled bottom halves this can be preempted on PREEMPT_RT. * Limit the locking to RT to avoid assigning `owner' if it can be * avoided. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); ovs_pcpu->owner = current; ovs_pcpu_locked = true; } error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); if (ovs_pcpu_locked) { ovs_pcpu->owner = NULL; local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); } stats_counter = &stats->n_hit; out: /* Update datapath statistics. */ u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct dp_stats_percpu *stats; int err; if (trace_ovs_dp_upcall_enabled()) trace_ovs_dp_upcall(dp, skb, key, upcall_info); if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; } if (!skb_is_gso(skb)) err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; return 0; err: stats = this_cpu_ptr(dp->stats_percpu); u64_stats_update_begin(&stats->syncp); stats->n_lost++; u64_stats_update_end(&stats->syncp); return err; } static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); if (segs == NULL) return -EINVAL; if (gso_type & SKB_GSO_UDP) { /* The initial flow key extracted by ovs_flow_key_extract() * in this case is for a first fragment, so we need to * properly mark later fragments. */ later_key = *key; later_key.ip.frag = OVS_FRAG_TYPE_LATER; } /* Queue all of the segments. */ skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; } /* Free all of the segments. */ skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); } return err; } static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) size += NLA_ALIGN(upcall_info->userdata->nla_len); /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ if (upcall_info->egress_tun_info) size += nla_total_size(ovs_tun_key_attr_size()); /* OVS_PACKET_ATTR_ACTIONS */ if (upcall_info->actions_len) size += nla_total_size(actions_attrlen); /* OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) size += nla_total_size(sizeof(upcall_info->mru)); return size; } static void pad_packet(struct datapath *dp, struct sk_buff *skb) { if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { size_t plen = NLA_ALIGN(skb->len) - skb->len; if (plen > 0) skb_put_zero(skb, plen); } } static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; size_t len; unsigned int hlen; int err, dp_ifindex; u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) return -ENODEV; if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; skb = nskb; } if (nla_attr_size(skb->len) > USHRT_MAX) { err = -EFBIG; goto out; } /* Complete checksum if needed */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_csum_hwoffload_help(skb, 0))) goto out; /* Older versions of OVS user space enforce alignment of the last * Netlink attribute to NLA_ALIGNTO which would require extensive * padding logic. Only perform zerocopy if padding is not required. */ if (dp->user_features & OVS_DP_F_UNALIGNED) hlen = skb_zerocopy_headlen(skb); else hlen = skb->len; len = upcall_msg_size(upcall_info, hlen - cutlen, OVS_CB(skb)->acts_origlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; } upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, 0, upcall_info->cmd); if (!upcall) { err = -EINVAL; goto out; } upcall->dp_ifindex = dp_ifindex; err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); if (err) goto out; if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); if (upcall_info->egress_tun_info) { nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_tunnel_info(user_skb, upcall_info->egress_tun_info); if (err) goto out; nla_nest_end(user_skb, nla); } if (upcall_info->actions_len) { nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); if (!nla) { err = -EMSGSIZE; goto out; } err = ovs_nla_put_actions(upcall_info->actions, upcall_info->actions_len, user_skb); if (!err) nla_nest_end(user_skb, nla); else nla_nest_cancel(user_skb, nla); } /* Add OVS_PACKET_ATTR_MRU */ if (upcall_info->mru && nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { err = -ENOBUFS; goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ if (cutlen > 0 && nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { err = -ENOBUFS; goto out; } /* Add OVS_PACKET_ATTR_HASH */ hash = skb_get_hash_raw(skb); if (skb->sw_hash) hash |= OVS_PACKET_HASH_SW_BIT; if (skb->l4_hash) hash |= OVS_PACKET_HASH_L4_BIT; if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { err = -ENOBUFS; goto out; } /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { err = -ENOBUFS; goto out; } nla->nla_len = nla_attr_size(skb->len - cutlen); err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); user_skb = NULL; out: if (err) skb_tx_error(skb); consume_skb(user_skb); consume_skb(nskb); return err; } static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct datapath *dp; struct vport *input_vport; u16 mru = 0; u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || !a[OVS_PACKET_ATTR_ACTIONS]) goto err; len = nla_len(a[OVS_PACKET_ATTR_PACKET]); packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); err = -ENOMEM; if (!packet) goto err; skb_reserve(packet, NET_IP_ALIGN); nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); /* Set packet's mru */ if (a[OVS_PACKET_ATTR_MRU]) { mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); packet->ignore_df = 1; } OVS_CB(packet)->mru = mru; if (a[OVS_PACKET_ATTR_HASH]) { hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); __skb_set_hash(packet, hash & 0xFFFFFFFFULL, !!(hash & OVS_PACKET_HASH_SW_BIT), !!(hash & OVS_PACKET_HASH_L4_BIT)); } OVS_CB(packet)->upcall_pid = nla_get_u32_default(a[OVS_PACKET_ATTR_UPCALL_PID], 0); /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); if (IS_ERR(flow)) goto err_kfree_skb; err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], packet, &flow->key, log); if (err) goto err_flow_free; err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); if (!input_vport) input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); if (!input_vport) goto err_unlock; packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); if (IS_ENABLED(CONFIG_PREEMPT_RT)) this_cpu_write(ovs_pcpu_storage->owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); if (IS_ENABLED(CONFIG_PREEMPT_RT)) this_cpu_write(ovs_pcpu_storage->owner, NULL); local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); local_bh_enable(); rcu_read_unlock(); ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: return err; } static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, [OVS_PACKET_ATTR_UPCALL_PID] = { .type = NLA_U32 }, }; static const struct genl_small_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_packet_cmd_execute } }; static struct genl_family dp_packet_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, .policy = packet_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_packet_genl_ops, .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops), .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1, .module = THIS_MODULE, }; static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, struct ovs_dp_megaflow_stats *mega_stats) { int i; memset(mega_stats, 0, sizeof(*mega_stats)); stats->n_flows = ovs_flow_tbl_count(&dp->table); mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; for_each_possible_cpu(i) { const struct dp_stats_percpu *percpu_stats; struct dp_stats_percpu local_stats; unsigned int start; percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; mega_stats->n_cache_hit += local_stats.n_cache_hit; } } static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) { return ovs_identifier_is_ufid(sfid) && !(ufid_flags & OVS_UFID_F_OMIT_KEY); } static bool should_fill_mask(uint32_t ufid_flags) { return !(ufid_flags & OVS_UFID_F_OMIT_MASK); } static bool should_fill_actions(uint32_t ufid_flags) { return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); } static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, const struct sw_flow_id *sfid, uint32_t ufid_flags) { size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback * see ovs_nla_put_identifier() */ if (sfid && ovs_identifier_is_ufid(sfid)) len += nla_total_size(sfid->ufid_len); else len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_KEY */ if (!sfid || should_fill_key(sfid, ufid_flags)) len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_MASK */ if (should_fill_mask(ufid_flags)) len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) len += nla_total_size(acts->orig_len); return len + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, struct sk_buff *skb) { struct ovs_flow_stats stats; __be16 tcp_flags; unsigned long used; ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used), OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if (stats.n_packets && nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats, OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) return -EMSGSIZE; return 0; } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, struct sk_buff *skb, int skb_orig_len) { struct nlattr *start; int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for * Netlink but individual action lists can be longer than * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. * The userspace caller can always fetch the actions separately if it * really wants them. (Most userspace callers in fact don't care.) * * This can only fail for dump operations because the skb is always * properly sized for single flows. */ start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { const struct sw_flow_actions *sf_acts; sf_acts = rcu_dereference_ovsl(flow->sf_acts); err = ovs_nla_put_actions(sf_acts->actions, sf_acts->actions_len, skb); if (!err) nla_nest_end(skb, start); else { if (skb_orig_len) return err; nla_nest_cancel(skb, start); } } else if (skb_orig_len) { return -EMSGSIZE; } return 0; } /* Called with ovs_mutex or RCU read lock. */ static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd, u32 ufid_flags) { const int skb_orig_len = skb->len; struct ovs_header *ovs_header; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = dp_ifindex; err = ovs_nla_put_identifier(flow, skb); if (err) goto error; if (should_fill_key(&flow->id, ufid_flags)) { err = ovs_nla_put_masked_key(flow, skb); if (err) goto error; } if (should_fill_mask(ufid_flags)) { err = ovs_nla_put_mask(flow, skb); if (err) goto error; } err = ovs_flow_cmd_fill_stats(flow, skb); if (err) goto error; if (should_fill_actions(ufid_flags)) { err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); if (err) goto error; } genlmsg_end(skb, ovs_header); return 0; error: genlmsg_cancel(skb, ovs_header); return err; } /* May not be called with RCU read lock. */ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, const struct sw_flow_id *sfid, struct genl_info *info, bool always, uint32_t ufid_flags) { struct sk_buff *skb; size_t len; if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) return NULL; len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); skb = genlmsg_new(len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); return skb; } /* Called with ovs_mutex. */ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, int dp_ifindex, struct genl_info *info, u8 cmd, bool always, u32 ufid_flags) { struct sk_buff *skb; int retval; skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), &flow->id, info, always, ufid_flags); if (IS_ERR_OR_NULL(skb)) return skb; retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); if (WARN_ON_ONCE(retval < 0)) { kfree_skb(skb); skb = ERR_PTR(retval); } return skb; } static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error; bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Must have key and actions. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attr not present in new flow."); goto error; } if (!a[OVS_FLOW_ATTR_ACTIONS]) { OVS_NLERR(log, "Flow actions attr not present in new flow."); goto error; } /* Most of the time we need to allocate a new flow, do it before * locking. */ new_flow = ovs_flow_alloc(); if (IS_ERR(new_flow)) { error = PTR_ERR(new_flow); goto error; } /* Extract key. */ key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) { error = -ENOMEM; goto err_kfree_flow; } ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_key; ovs_flow_mask_key(&new_flow->key, key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], key, log); if (error) goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; } ovs_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; } /* Check if this is a duplicate flow */ if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); /* Put flow in bucket. */ error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); if (unlikely(error)) { acts = NULL; goto err_unlock_ovs; } if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(new_flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags); BUG_ON(error < 0); } ovs_unlock(); } else { struct sw_flow_actions *old_acts; /* Bail out if we're not allowed to modify an existing flow. * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL * because Generic Netlink treats the latter as a dump * request. We also accept NLM_F_EXCL in case that bug ever * gets fixed. */ if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))) { error = -EEXIST; goto err_unlock_ovs; } /* The flow identifier has to be the same for flow updates. * Look for any overlapping flow. */ if (unlikely(!ovs_flow_cmp(flow, &match))) { if (ovs_identifier_is_key(&flow->id)) flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); else /* UFID matches but key is different */ flow = NULL; if (!flow) { error = -ENOENT; goto err_unlock_ovs; } } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags); BUG_ON(error < 0); } ovs_unlock(); ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } if (reply) ovs_notify(&dp_flow_genl_family, reply, info); kfree(key); return 0; err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); err_kfree_key: kfree(key); err_kfree_flow: ovs_flow_free(new_flow, false); error: return error; } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net, const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; int error; ovs_flow_mask_key(&masked_key, key, true, mask); error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); return ERR_PTR(error); } return acts; } /* Factor out match-init and action-copy to avoid * "Wframe-larger-than=1024" warning. Because mask is only * used to get actions, we new a function to save some * stack space. * * If there are not key and action attrs, we return 0 * directly. In the case, the caller will also not use the * match as before. If there is action attr, we try to get * actions and save them to *acts. Before returning from * the function, we reset the match->mask pointer. Because * we should not to return match object with dangling reference * to mask. * */ static noinline_for_stack int ovs_nla_init_match_and_action(struct net *net, struct sw_flow_match *match, struct sw_flow_key *key, struct nlattr **a, struct sw_flow_actions **acts, bool log) { struct sw_flow_mask mask; int error = 0; if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(match, key, true, &mask); error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; } if (a[OVS_FLOW_ATTR_ACTIONS]) { if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attribute not present in set flow."); error = -EINVAL; goto error; } *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, &mask, log); if (IS_ERR(*acts)) { error = PTR_ERR(*acts); goto error; } } /* On success, error is 0. */ error: match->mask = NULL; return error; } static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow_key key; struct sw_flow *flow; struct sk_buff *reply = NULL; struct datapath *dp; struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; struct sw_flow_id sfid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) { OVS_NLERR(log, "Flow set message rejected, Key attribute missing."); return -EINVAL; } error = ovs_nla_init_match_and_action(net, &match, &key, a, &acts, log); if (error) goto error; if (acts) { /* Can allocate before locking if have acts. */ reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_kfree_acts; } } ovs_lock(); dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; } /* Check that the flow exists. */ if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { error = -ENOENT; goto err_unlock_ovs; } /* Update actions, if present. */ if (likely(acts)) { old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); if (unlikely(reply)) { error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_SET, ufid_flags); BUG_ON(error < 0); } } else { /* Could not alloc without acts before locking. */ reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, OVS_FLOW_CMD_SET, false, ufid_flags); if (IS_ERR(reply)) { error = PTR_ERR(reply); goto err_unlock_ovs; } } /* Clear stats. */ if (a[OVS_FLOW_ATTR_CLEAR]) ovs_flow_stats_clear(flow); ovs_unlock(); if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) ovs_nla_free_flow_actions_rcu(old_acts); return 0; err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); error: return error; } static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; struct sw_flow_match match; struct sw_flow_id ufid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err = 0; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, "Flow get message rejected, Key attribute missing."); err = -EINVAL; } if (err) return err; ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto unlock; } if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { err = -ENOENT; goto unlock; } reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, OVS_FLOW_CMD_GET, true, ufid_flags); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; } ovs_unlock(); return genlmsg_reply(reply, info); unlock: ovs_unlock(); return err; } static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; struct datapath *dp; struct sw_flow_match match; struct sw_flow_id ufid; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err; bool log = !a[OVS_FLOW_ATTR_PROBE]; bool ufid_present; ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) return err; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (unlikely(!dp)) { err = -ENODEV; goto unlock; } if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { err = ovs_flow_tbl_flush(&dp->table); goto unlock; } if (ufid_present) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); else flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (unlikely(!flow)) { err = -ENOENT; goto unlock; } ovs_flow_tbl_remove(&dp->table, flow); ovs_unlock(); reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. */ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); if (WARN_ON_ONCE(err < 0)) { kfree_skb(reply); goto out_free; } ovs_notify(&dp_flow_genl_family, reply, info); } else { netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); } } out_free: ovs_flow_free(flow, true); return 0; unlock: ovs_unlock(); return err; } static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *a[__OVS_FLOW_ATTR_MAX]; struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct table_instance *ti; struct datapath *dp; u32 ufid_flags; int err; err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, OVS_FLOW_ATTR_MAX, flow_policy, NULL); if (err) return err; ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; } ti = rcu_dereference(dp->table.ti); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); if (!flow) break; if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_FLOW_CMD_GET, ufid_flags) < 0) break; cb->args[0] = bucket; cb->args[1] = obj; } rcu_read_unlock(); return skb->len; } static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, }; static const struct genl_small_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_del }, { .cmd = OVS_FLOW_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_flow_cmd_get, .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_set, }, }; static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, .policy = flow_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_flow_genl_ops, .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops), .resv_start_op = OVS_FLOW_CMD_SET + 1, .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) { size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); msgsize += nla_total_size(IFNAMSIZ); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */ return msgsize; } /* Called with ovs_mutex. */ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; struct ovs_dp_megaflow_stats dp_megaflow_stats; struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids); int err, pids_len; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, flags, cmd); if (!ovs_header) goto error; ovs_header->dp_ifindex = get_dpifindex(dp); err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); if (err) goto nla_put_failure; get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, sizeof(struct ovs_dp_megaflow_stats), &dp_megaflow_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, ovs_flow_tbl_masks_cache_size(&dp->table))) goto nla_put_failure; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32); if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids)) goto nla_put_failure; } genlmsg_end(skb, ovs_header); return 0; nla_put_failure: genlmsg_cancel(skb, ovs_header); error: return -EMSGSIZE; } static struct sk_buff *ovs_dp_cmd_alloc_info(void) { return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } /* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, const struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) { struct datapath *dp; if (!a[OVS_DP_ATTR_NAME]) dp = get_dp(net, ovs_header->dp_ifindex); else { struct vport *vport; vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; } return dp ? dp : ERR_PTR(-ENODEV); } static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) { struct datapath *dp; dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) return; pr_warn("%s: Dropping previously announced user features\n", ovs_dp_name(dp)); dp->user_features = 0; } static int ovs_dp_set_upcall_portids(struct datapath *dp, const struct nlattr *ids) { struct dp_nlsk_pids *old, *dp_nlsk_pids; if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) return -EINVAL; old = ovsl_dereference(dp->upcall_portids); dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), GFP_KERNEL); if (!dp_nlsk_pids) return -ENOMEM; dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); kfree_rcu(old, rcu); return 0; } u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) { struct dp_nlsk_pids *dp_nlsk_pids; dp_nlsk_pids = rcu_dereference(dp->upcall_portids); if (dp_nlsk_pids) { if (cpu_id < dp_nlsk_pids->n_pids) { return dp_nlsk_pids->pids[cpu_id]; } else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) { /* If the number of netlink PIDs is mismatched with * the number of CPUs as seen by the kernel, log this * and send the upcall to an arbitrary socket (0) in * order to not drop packets */ pr_info_ratelimited("cpu_id mismatch with handler threads"); return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids]; } else { return 0; } } else { return 0; } } static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0, old_features = dp->user_features; int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | OVS_DP_F_TC_RECIRC_SHARING | OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (user_features & OVS_DP_F_TC_RECIRC_SHARING) return -EOPNOTSUPP; #endif } if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { int err; u32 cache_size; cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); if (err) return err; } dp->user_features = user_features; if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && a[OVS_DP_ATTR_PER_CPU_PIDS]) { /* Upcall Netlink Port IDs have been updated */ err = ovs_dp_set_upcall_portids(dp, a[OVS_DP_ATTR_PER_CPU_PIDS]); if (err) return err; } if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && !(old_features & OVS_DP_F_TC_RECIRC_SHARING)) tc_skb_ext_tc_enable(); else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && (old_features & OVS_DP_F_TC_RECIRC_SHARING)) tc_skb_ext_tc_disable(); return 0; } static int ovs_dp_stats_init(struct datapath *dp) { dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); if (!dp->stats_percpu) return -ENOMEM; return 0; } static int ovs_dp_vport_init(struct datapath *dp) { int i; dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dp->ports) return -ENOMEM; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) INIT_HLIST_HEAD(&dp->ports[i]); return 0; } static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct vport_parms parms; struct sk_buff *reply; struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) goto err_destroy_dp; err = ovs_dp_stats_init(dp); if (err) goto err_destroy_table; err = ovs_dp_vport_init(dp); if (err) goto err_destroy_stats; err = ovs_meters_init(dp); if (err) goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); parms.type = OVS_VPORT_TYPE_INTERNAL; parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; parms.desired_ifindex = nla_get_s32_default(a[OVS_DP_ATTR_IFINDEX], 0); /* So far only local changes have been made, now need the lock. */ ovs_lock(); err = ovs_dp_change(dp, a); if (err) goto err_unlock_and_destroy_meters; vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); if (err == -EBUSY) err = -EEXIST; if (err == -EEXIST) { /* An outdated user space instance that does not understand * the concept of user_features has attempted to create a new * datapath and is likely to reuse it. Drop all user features. */ if (info->genlhdr->version < OVS_DP_VER_FEATURES) ovs_dp_reset_user_features(skb, info); } goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); list_add_tail_rcu(&dp->list_node, &ovs_net->dps); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_destroy_portids: kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: ovs_unlock(); ovs_meters_exit(dp); err_destroy_ports: kfree(dp->ports); err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); err_destroy_dp: kfree(dp); err_destroy_reply: kfree_skb(reply); err: return err; } /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { struct flow_table *table = &dp->table; int i; if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) tc_skb_ext_tc_disable(); for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) if (vport->port_no != OVSP_LOCAL) ovs_dp_detach_port(vport); } list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that * all ports in datapath are destroyed first before freeing datapath. */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); /* Flush sw_flow in the tables. RCU cb only releases resource * such as dp, ports and tables. That may avoid some issues * such as RCU usage warning. */ table_instance_flow_flush(table, ovsl_dereference(table->ti), ovsl_dereference(table->ufid_ti)); /* RCU destroy the ports, meters and flow tables. */ call_rcu(&dp->rcu, destroy_dp_rcu); } static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_DEL); BUG_ON(err < 0); __dp_destroy(dp); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; err = ovs_dp_change(dp, info->attrs); if (err) goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_SET); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_datapath_genl_family, reply, info); return 0; err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *reply; struct datapath *dp; int err; reply = ovs_dp_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); goto err_unlock_free; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_GET); BUG_ON(err < 0); ovs_unlock(); return genlmsg_reply(reply, info); err_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); struct datapath *dp; int skip = cb->args[0]; int i = 0; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_DP_CMD_GET) < 0) break; i++; } ovs_unlock(); cb->args[0] = i; return skb->len; } static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), [OVS_DP_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), }; static const struct genl_small_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_del }, { .cmd = OVS_DP_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_dp_cmd_get, .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_set, }, }; static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, .policy = datapath_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_datapath_genl_ops, .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops), .resv_start_op = OVS_DP_CMD_SET + 1, .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct net *net, u32 portid, u32 seq, u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, flags, cmd); if (!ovs_header) return -EMSGSIZE; ovs_header->dp_ifindex = get_dpifindex(vport->dp); if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, ovs_vport_name(vport)) || nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; rcu_read_lock(); net_vport = dev_net_rcu(vport->dev); if (!net_eq(net, net_vport)) { int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) goto nla_put_failure_unlock; } rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), &vport_stats, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; if (ovs_vport_get_upcall_stats(vport, skb)) goto nla_put_failure; if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; genlmsg_end(skb, ovs_header); return 0; nla_put_failure_unlock: rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; } static struct sk_buff *ovs_vport_cmd_alloc_info(void) { return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); } /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, GFP_KERNEL); BUG_ON(retval < 0); return skb; } /* Called with ovs_mutex or RCU read lock. */ static struct vport *lookup_vport(struct net *net, const struct ovs_header *ovs_header, struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) { struct datapath *dp; struct vport *vport; if (a[OVS_VPORT_ATTR_IFINDEX]) return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) return ERR_PTR(-ENODEV); if (ovs_header->dp_ifindex && ovs_header->dp_ifindex != get_dpifindex(vport->dp)) return ERR_PTR(-ENODEV); return vport; } else if (a[OVS_VPORT_ATTR_PORT_NO]) { u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); if (port_no >= DP_MAX_PORTS) return ERR_PTR(-EFBIG); dp = get_dp(net, ovs_header->dp_ifindex); if (!dp) return ERR_PTR(-ENODEV); vport = ovs_vport_ovsl_rcu(dp, port_no); if (!vport) return ERR_PTR(-ENODEV); return vport; } else return ERR_PTR(-EINVAL); } static unsigned int ovs_get_max_headroom(struct datapath *dp) { unsigned int dev_headroom, max_headroom = 0; struct net_device *dev; struct vport *vport; int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } } return max_headroom; } /* Called with ovs_mutex */ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) { struct vport *vport; int i; dp->max_headroom = new_headroom; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; struct datapath *dp; unsigned int new_headroom; u32 port_no; int err; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; port_no = nla_get_u32_default(a[OVS_VPORT_ATTR_PORT_NO], 0); if (port_no >= DP_MAX_PORTS) return -EFBIG; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto exit_unlock_free; if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) break; } } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; parms.desired_ifindex = nla_get_s32_default(a[OVS_VPORT_ATTR_IFINDEX], 0); vport = new_vport(&parms); err = PTR_ERR(vport); if (IS_ERR(vport)) { if (err == -EAGAIN) goto restart; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW, GFP_KERNEL); new_headroom = netdev_get_fwd_headroom(vport->dev); if (new_headroom > dp->max_headroom) ovs_update_headroom(dp, new_headroom); else netdev_set_rx_headroom(vport->dev, dp->max_headroom); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct sk_buff *reply; struct vport *vport; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; if (a[OVS_VPORT_ATTR_TYPE] && nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { err = -EINVAL; goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_OPTIONS]) { err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); if (err) goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_UPCALL_PID]) { struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; err = ovs_vport_set_upcall_portids(vport, ids); if (err) goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { bool update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; struct datapath *dp; struct vport *vport; unsigned int new_headroom; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; ovs_lock(); vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; if (vport->port_no == OVSP_LOCAL) { err = -EINVAL; goto exit_unlock_free; } err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_DEL, GFP_KERNEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ dp = vport->dp; if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) update_headroom = true; netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); if (update_headroom) { new_headroom = ovs_get_max_headroom(dp); if (new_headroom < dp->max_headroom) ovs_update_headroom(dp, new_headroom); } ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); return 0; exit_unlock_free: ovs_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; reply = ovs_vport_cmd_alloc_info(); if (!reply) return -ENOMEM; rcu_read_lock(); vport = lookup_vport(sock_net(skb->sk), ovs_header, a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_GET, GFP_ATOMIC); BUG_ON(err < 0); rcu_read_unlock(); return genlmsg_reply(reply, info); exit_unlock_free: rcu_read_unlock(); kfree_skb(reply); return err; } static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); struct datapath *dp; int bucket = cb->args[0], skip = cb->args[1]; int i, j = 0; rcu_read_lock(); dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { rcu_read_unlock(); return -ENODEV; } for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; j = 0; hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && ovs_vport_cmd_fill_info(vport, skb, sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, OVS_VPORT_CMD_GET, GFP_ATOMIC) < 0) goto out; j++; } skip = 0; } out: rcu_read_unlock(); cb->args[0] = i; cb->args[1] = j; return skb->len; } static void ovs_dp_masks_rebalance(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, masks_rebalance.work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) ovs_flow_masks_rebalance(&dp->table); ovs_unlock(); schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); } static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_del }, { .cmd = OVS_VPORT_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_vport_cmd_get, .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_set, }, }; struct genl_family dp_vport_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, .policy = vport_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_vport_genl_ops, .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), .resv_start_op = OVS_VPORT_CMD_SET + 1, .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; static struct genl_family * const dp_genl_families[] = { &dp_datapath_genl_family, &dp_vport_genl_family, &dp_flow_genl_family, &dp_packet_genl_family, &dp_meter_genl_family, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) &dp_ct_limit_genl_family, #endif }; static void dp_unregister_genl(int n_families) { int i; for (i = 0; i < n_families; i++) genl_unregister_family(dp_genl_families[i]); } static int __init dp_register_genl(void) { int err; int i; for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { err = genl_register_family(dp_genl_families[i]); if (err) goto error; } return 0; error: dp_unregister_genl(i); return err; } static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); err = ovs_ct_init(net); if (err) return err; schedule_delayed_work(&ovs_net->masks_rebalance, msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); return 0; } static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct list_head *head) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); struct datapath *dp; list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } } } static void __net_exit ovs_exit_net(struct net *dnet) { struct datapath *dp, *dp_next; struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); struct vport *vport, *vport_next; struct net *net; LIST_HEAD(head); ovs_lock(); ovs_ct_exit(dnet); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); up_read(&net_rwsem); /* Detach all vports from given namespace. */ list_for_each_entry_safe(vport, vport_next, &head, detach_list) { list_del(&vport->detach_list); ovs_dp_detach_port(vport); } ovs_unlock(); cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } static struct pernet_operations ovs_net_ops = { .init = ovs_init_net, .exit = ovs_exit_net, .id = &ovs_net_id, .size = sizeof(struct ovs_net), }; static const char * const ovs_drop_reasons[] = { #define S(x) [(x) & ~SKB_DROP_REASON_SUBSYS_MASK] = (#x), OVS_DROP_REASONS(S) #undef S }; static struct drop_reason_list drop_reason_list_ovs = { .reasons = ovs_drop_reasons, .n_reasons = ARRAY_SIZE(ovs_drop_reasons), }; static int __init ovs_alloc_percpu_storage(void) { unsigned int cpu; ovs_pcpu_storage = alloc_percpu(*ovs_pcpu_storage); if (!ovs_pcpu_storage) return -ENOMEM; for_each_possible_cpu(cpu) { struct ovs_pcpu_storage *ovs_pcpu; ovs_pcpu = per_cpu_ptr(ovs_pcpu_storage, cpu); local_lock_init(&ovs_pcpu->bh_lock); } return 0; } static void ovs_free_percpu_storage(void) { free_percpu(ovs_pcpu_storage); } static int __init dp_init(void) { int err; BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); err = ovs_alloc_percpu_storage(); if (err) goto error; err = ovs_internal_dev_rtnl_link_register(); if (err) goto error; err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; err = ovs_vport_init(); if (err) goto error_flow_exit; err = register_pernet_device(&ovs_net_ops); if (err) goto error_vport_exit; err = register_netdevice_notifier(&ovs_dp_device_notifier); if (err) goto error_netns_exit; err = ovs_netdev_init(); if (err) goto error_unreg_notifier; err = dp_register_genl(); if (err < 0) goto error_unreg_netdev; drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, &drop_reason_list_ovs); return 0; error_unreg_netdev: ovs_netdev_exit(); error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: unregister_pernet_device(&ovs_net_ops); error_vport_exit: ovs_vport_exit(); error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); error: ovs_free_percpu_storage(); return err; } static void dp_cleanup(void) { dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); ovs_free_percpu_storage(); } module_init(dp_init); module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY);
2 8 3 5 5 1 1 5 5 5 1 1 5 3 3 3 8 3 3 3 3 7 4 4 4 4 4 6 3 6 6 6 6 6 6 3 3 4 5 3 2 3 3 5 2 2 2 1 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* Priority handling * RFC DRAFT ndata section 3.4 */ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); static struct sctp_stream_priorities *sctp_sched_prio_head_get(struct sctp_stream_priorities *p) { p->users++; return p; } static void sctp_sched_prio_head_put(struct sctp_stream_priorities *p) { if (p && --p->users == 0) kfree(p); } static struct sctp_stream_priorities *sctp_sched_prio_new_head( struct sctp_stream *stream, int prio, gfp_t gfp) { struct sctp_stream_priorities *p; p = kmalloc(sizeof(*p), gfp); if (!p) return NULL; INIT_LIST_HEAD(&p->prio_sched); INIT_LIST_HEAD(&p->active); p->next = NULL; p->prio = prio; p->users = 1; return p; } static struct sctp_stream_priorities *sctp_sched_prio_get_head( struct sctp_stream *stream, int prio, gfp_t gfp) { struct sctp_stream_priorities *p; int i; /* Look into scheduled priorities first, as they are sorted and * we can find it fast IF it's scheduled. */ list_for_each_entry(p, &stream->prio_list, prio_sched) { if (p->prio == prio) return sctp_sched_prio_head_get(p); if (p->prio > prio) break; } /* No luck. So we search on all streams now. */ for (i = 0; i < stream->outcnt; i++) { if (!SCTP_SO(stream, i)->ext) continue; p = SCTP_SO(stream, i)->ext->prio_head; if (!p) /* Means all other streams won't be initialized * as well. */ break; if (p->prio == prio) return sctp_sched_prio_head_get(p); } /* If not even there, allocate a new one. */ return sctp_sched_prio_new_head(stream, prio, gfp); } static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) { struct list_head *pos; pos = p->next->prio_list.next; if (pos == &p->active) pos = pos->next; p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); } static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) { bool scheduled = false; if (!list_empty(&soute->prio_list)) { struct sctp_stream_priorities *prio_head = soute->prio_head; /* Scheduled */ scheduled = true; if (prio_head->next == soute) /* Try to move to the next stream */ sctp_sched_prio_next_stream(prio_head); list_del_init(&soute->prio_list); /* Also unsched the priority if this was the last stream */ if (list_empty(&prio_head->active)) { list_del_init(&prio_head->prio_sched); /* If there is no stream left, clear next */ prio_head->next = NULL; } } return scheduled; } static void sctp_sched_prio_sched(struct sctp_stream *stream, struct sctp_stream_out_ext *soute) { struct sctp_stream_priorities *prio, *prio_head; prio_head = soute->prio_head; /* Nothing to do if already scheduled */ if (!list_empty(&soute->prio_list)) return; /* Schedule the stream. If there is a next, we schedule the new * one before it, so it's the last in round robin order. * If there isn't, we also have to schedule the priority. */ if (prio_head->next) { list_add(&soute->prio_list, prio_head->next->prio_list.prev); return; } list_add(&soute->prio_list, &prio_head->active); prio_head->next = soute; list_for_each_entry(prio, &stream->prio_list, prio_sched) { if (prio->prio > prio_head->prio) { list_add(&prio_head->prio_sched, prio->prio_sched.prev); return; } } list_add_tail(&prio_head->prio_sched, &stream->prio_list); } static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, __u16 prio, gfp_t gfp) { struct sctp_stream_out *sout = SCTP_SO(stream, sid); struct sctp_stream_out_ext *soute = sout->ext; struct sctp_stream_priorities *prio_head, *old; bool reschedule = false; old = soute->prio_head; if (old && old->prio == prio) return 0; prio_head = sctp_sched_prio_get_head(stream, prio, gfp); if (!prio_head) return -ENOMEM; reschedule = sctp_sched_prio_unsched(soute); soute->prio_head = prio_head; if (reschedule) sctp_sched_prio_sched(stream, soute); sctp_sched_prio_head_put(old); return 0; } static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = SCTP_SO(stream, sid)->ext->prio_head->prio; return 0; } static int sctp_sched_prio_init(struct sctp_stream *stream) { INIT_LIST_HEAD(&stream->prio_list); return 0; } static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { INIT_LIST_HEAD(&SCTP_SO(stream, sid)->ext->prio_list); return sctp_sched_prio_set(stream, sid, 0, gfp); } static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) { sctp_sched_prio_head_put(SCTP_SO(stream, sid)->ext->prio_head); SCTP_SO(stream, sid)->ext->prio_head = NULL; } static void sctp_sched_prio_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { struct sctp_stream *stream; struct sctp_chunk *ch; __u16 sid; ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); sid = sctp_chunk_stream_no(ch); stream = &q->asoc->stream; sctp_sched_prio_sched(stream, SCTP_SO(stream, sid)->ext); } static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_stream_priorities *prio; struct sctp_stream_out_ext *soute; struct sctp_chunk *ch = NULL; /* Bail out quickly if queue is empty */ if (list_empty(&q->out_chunk_list)) goto out; /* Find which chunk is next. It's easy, it's either the current * one or the first chunk on the next active stream. */ if (stream->out_curr) { soute = stream->out_curr->ext; } else { prio = list_entry(stream->prio_list.next, struct sctp_stream_priorities, prio_sched); soute = prio->next; } ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { struct sctp_stream_priorities *prio; struct sctp_stream_out_ext *soute; __u16 sid; /* Last chunk on that msg, move to the next stream on * this priority. */ sid = sctp_chunk_stream_no(ch); soute = SCTP_SO(&q->asoc->stream, sid)->ext; prio = soute->prio_head; sctp_sched_prio_next_stream(prio); if (list_empty(&soute->outq)) sctp_sched_prio_unsched(soute); } static void sctp_sched_prio_sched_all(struct sctp_stream *stream) { struct sctp_association *asoc; struct sctp_stream_out *sout; struct sctp_chunk *ch; asoc = container_of(stream, struct sctp_association, stream); list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { __u16 sid; sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(stream, sid); if (sout->ext) sctp_sched_prio_sched(stream, sout->ext); } } static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) { struct sctp_stream_priorities *p, *tmp; struct sctp_stream_out_ext *soute, *souttmp; list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) sctp_sched_prio_unsched(soute); } static struct sctp_sched_ops sctp_sched_prio = { .set = sctp_sched_prio_set, .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, .free_sid = sctp_sched_prio_free_sid, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, .dequeue_done = sctp_sched_prio_dequeue_done, .sched_all = sctp_sched_prio_sched_all, .unsched_all = sctp_sched_prio_unsched_all, }; void sctp_sched_ops_prio_init(void) { sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio); }
353 355 341 355 355 1 355 339 355 17 16 15 11 14 14 14 14 14 14 13 8 8 18 17 12 19 19 24 24 23 22 21 20 24 19 19 19 19 27 27 24 27 168 167 168 166 94 4 164 163 163 162 163 163 162 161 170 160 157 158 157 156 99 124 45 84 86 85 19 18 15 13 11 10 3 8 8 8 7 7 5 8 21 19 18 7 19 8 19 16 6 3 16 16 4 16 19 8 2 9 10 8 8 5 4 1 8 9 8 8 6 5 6 5 5 4 3 2 2 2 5 25 25 25 25 25 25 25 15 16 14 15 4 3 1 14 10 4 3 3 8 8 32 33 34 35 32 35 35 34 33 35 35 35 35 17 16 2 16 14 14 14 17 3 5 9 21 21 21 22 21 3 22 844 842 844 2597 32 2572 1228 1485 844 843 2026 1577 2577 2565 2569 2557 2564 2313 2482 2009 1767 2477 1396 1144 2481 443 2482 2477 2483 2473 2465 1389 154 154 153 2587 7 7 2595 2494 2586 206 211 210 12 211 2 2 2 2464 25 1 24 2382 2438 95 2436 2434 2433 2429 585 2521 1923 2514 13 12 2509 28 2490 504 2510 684 2514 514 2508 2392 538 574 37 2510 2091 2420 10 2426 14 2420 9 2425 10 2428 11 2421 7 6 2512 67 66 67 67 7 7 7 30 31 31 30 2433 2424 42 2401 2399 702 2221 2394 2427 2396 207 207 2214 2214 38 38 36 35 1 32 32 27 38 19 1603 1606 35 1609 1606 1604 1322 306 303 396 396 473 1235 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/open.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/module.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/audit.h> #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/dnotify.h> #include <linux/compat.h> #include <linux/mnt_idmapping.h> #include <linux/filelock.h> #include "internal.h" int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ if (length < 0) return -EINVAL; newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } /* Remove suid, sgid, and file capabilities on truncate too */ ret = dentry_needs_remove_privs(idmap, dentry); if (ret < 0) return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; ret = inode_lock_killable(dentry->d_inode); if (ret) return ret; /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; int error; inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode)) return -EINVAL; idmap = mnt_idmap(path->mnt); error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; error = fsnotify_truncate_perm(path, length); if (error) return error; error = mnt_want_write(path->mnt); if (error) return error; error = -EPERM; if (IS_APPEND(inode)) goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = security_path_truncate(path); if (!error) error = do_truncate(idmap, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); return error; } EXPORT_SYMBOL_GPL(vfs_truncate); int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int error; if (length < 0) /* sorry, but loff_t says... */ return -EINVAL; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (!error) { error = vfs_truncate(&path, length); path_put(&path); } if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) { return do_sys_truncate(path, length); } #endif int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; int error; /* explicitly opened as large or we are on 64-bit box */ if (file->f_flags & O_LARGEFILE) small = 0; dentry = file->f_path.dentry; inode = dentry->d_inode; if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) return -EINVAL; /* Cannot ftruncate over 2^31 bytes without large file support */ if (small && length > MAX_NON_LFS) return -EINVAL; /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(file))) return -EPERM; error = security_file_truncate(file); if (error) return error; error = fsnotify_truncate_perm(&file->f_path, length); if (error) return error; sb_start_write(inode->i_sb); error = do_truncate(file_mnt_idmap(file), dentry, length, ATTR_MTIME | ATTR_CTIME, file); sb_end_write(inode->i_sb); return error; } int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return do_ftruncate(fd_file(f), length, small); } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } #endif /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64) COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname, compat_arg_u64_dual(length)) { return ksys_truncate(pathname, compat_arg_u64_glue(length)); } #endif #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64) COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, compat_arg_u64_dual(length)) { return ksys_ftruncate(fd, compat_arg_u64_glue(length)); } #endif int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); int ret; loff_t sum; if (offset < 0 || len <= 0) return -EINVAL; if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; /* * Modes are exclusive, even if that is not obvious from the encoding * as bit masks and the mix with the flag in the same namespace. * * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is * encoded as no bit set. */ switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_ALLOCATE_RANGE: case FALLOC_FL_UNSHARE_RANGE: case FALLOC_FL_ZERO_RANGE: break; case FALLOC_FL_PUNCH_HOLE: if (!(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: case FALLOC_FL_WRITE_ZEROES: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; default: return -EOPNOTSUPP; } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) return -EPERM; /* * We cannot allow any fallocate operation on an active swapfile */ if (IS_SWAPFILE(inode)) return -ETXTBSY; /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. */ ret = security_file_permission(file, MAY_WRITE); if (ret) return ret; ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len); if (ret) return ret; if (S_ISFIFO(inode->i_mode)) return -ESPIPE; if (S_ISDIR(inode->i_mode)) return -EISDIR; if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -ENODEV; /* Check for wraparound */ if (check_add_overflow(offset, len, &sum)) return -EFBIG; if (sum > inode->i_sb->s_maxbytes) return -EFBIG; if (!file->f_op->fallocate) return -EOPNOTSUPP; file_start_write(file); ret = file->f_op->fallocate(file, mode, offset, len); /* * Create inotify and fanotify events. * * To keep the logic simple always create events if fallocate succeeds. * This implies that events are even created if the file size remains * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. */ if (ret == 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { return ksys_fallocate(fd, mode, offset, len); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE) COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset), compat_arg_u64_dual(len)) { return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset), compat_arg_u64_glue(len)); } #endif /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. * * Creating new credentials is expensive, so we try to skip doing it, * which we can if the result would match what we already got. */ static bool access_need_override_creds(int flags) { const struct cred *cred; if (flags & AT_EACCESS) return false; cred = current_cred(); if (!uid_eq(cred->fsuid, cred->uid) || !gid_eq(cred->fsgid, cred->gid)) return true; if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { if (!cap_isclear(cred->cap_effective)) return true; } else { if (!cap_isidentical(cred->cap_effective, cred->cap_permitted)) return true; } } return false; } static const struct cred *access_override_creds(void) { struct cred *override_cred; override_cred = prepare_creds(); if (!override_cred) return NULL; /* * XXX access_need_override_creds performs checks in hopes of skipping * this work. Make sure it stays in sync if making any changes in this * routine. */ override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ kuid_t root_uid = make_kuid(override_cred->user_ns, 0); if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = override_cred->cap_permitted; } /* * The new set of credentials can *only* be used in * task-synchronous circumstances, and does not need * RCU freeing, unless somebody then takes a separate * reference to it. * * NOTE! This is _only_ true because this credential * is used purely for override_creds() that installs * it as the subjective cred. Other threads will be * accessing ->real_cred, not the subjective cred. * * If somebody _does_ make a copy of this (using the * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous * cred accesses will keep things non-racy to avoid RCU * freeing. */ override_cred->non_rcu = 1; return override_creds(override_cred); } static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; int res; unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (access_need_override_creds(flags)) { old_cred = access_override_creds(); if (!old_cred) return -ENOMEM; } retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* * MAY_EXEC on regular files is denied if the fs is mounted * with the "noexec" flag. */ res = -EACCES; if (path_noexec(&path)) goto out_path_release; } res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; /* * This is a rare case where using __mnt_is_readonly() * is OK without a mnt_want/drop_write() pair. Since * no actual write to the fs is performed here, we do * not need to telegraph to that to anyone. * * By doing this, we accept that this access is * inherently racy and know that the fs may change * state before we even see this result. */ if (__mnt_is_readonly(path.mnt)) res = -EROFS; out_path_release: path_put(&path); if (retry_estale(res, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: if (old_cred) put_cred(revert_creds(old_cred)); return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { return do_faccessat(dfd, filename, mode, 0); } SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, int, flags) { return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { return do_faccessat(AT_FDCWD, filename, mode, 0); } SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; set_fs_pwd(current->fs, &path); dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { CLASS(fd_raw, f)(fd); int error; if (fd_empty(f)) return -EBADF; if (!d_can_lookup(fd_file(f)->f_path.dentry)) return -ENOTDIR; error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &fd_file(f)->f_path); return error; } SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; retry: error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) goto dput_and_out; set_fs_root(current->fs, &path); error = 0; dput_and_out: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; struct iattr newattrs; int error; error = mnt_want_write(path->mnt); if (error) return error; retry_deleg: error = inode_lock_killable(inode); if (error) goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } out_mnt_unlock: mnt_drop_write(path->mnt); return error; } int vfs_fchmod(struct file *file, umode_t mode) { audit_file(file); return chmod_common(&file->f_path, mode); } SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, unsigned int flags) { struct path path; int error; unsigned int lookup_flags; if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) return -EINVAL; lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { error = chmod_common(&path, mode); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } } return error; } SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, flags) { return do_fchmodat(dfd, filename, mode, flags); } SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * * Return: true if @kuid is valid, false if not. */ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) { if (!uid_valid(kuid)) return false; attr->ia_valid |= ATTR_UID; attr->ia_vfsuid = VFSUIDT_INIT(kuid); return true; } /* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * * Return: true if @kgid is valid, false if not. */ static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid) { if (!gid_valid(kgid)) return false; attr->ia_valid |= ATTR_GID; attr->ia_vfsgid = VFSGIDT_INIT(kgid); return true; } int chown_common(const struct path *path, uid_t user, gid_t group) { struct mnt_idmap *idmap; struct user_namespace *fs_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; struct iattr newattrs; kuid_t uid; kgid_t gid; uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); idmap = mnt_idmap(path->mnt); fs_userns = i_user_ns(inode); retry_deleg: newattrs.ia_vfsuid = INVALID_VFSUID; newattrs.ia_vfsgid = INVALID_VFSGID; newattrs.ia_valid = ATTR_CTIME; if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid)) return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; error = inode_lock_killable(inode); if (error) return error; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); /* Continue to send actual fs values, not the mount values. */ error = security_path_chown( path, from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid), from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid)); if (!error) error = notify_change(idmap, path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag) { struct path path; int error = -EINVAL; int lookup_flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); if (error) goto out_release; error = chown_common(&path, user, group); mnt_drop_write(path.mnt); out_release: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: return error; } SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, gid_t, group, int, flag) { return do_fchownat(dfd, filename, user, group, flag); } SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { return do_fchownat(AT_FDCWD, filename, user, group, AT_SYMLINK_NOFOLLOW); } int vfs_fchown(struct file *file, uid_t user, gid_t group) { int error; error = mnt_want_write_file(file); if (error) return error; audit_file(file); error = chown_common(&file->f_path, user, group); mnt_drop_write_file(file); return error; } int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { return ksys_fchown(fd, user, group); } static inline int file_get_write_access(struct file *f) { int error; error = get_write_access(f->f_inode); if (unlikely(error)) return error; error = mnt_get_write_access(f->f_path.mnt); if (unlikely(error)) goto cleanup_inode; if (unlikely(f->f_mode & FMODE_BACKING)) { error = mnt_get_write_access(backing_file_user_path(f)->mnt); if (unlikely(error)) goto cleanup_mnt; } return 0; cleanup_mnt: mnt_put_write_access(f->f_path.mnt); cleanup_inode: put_write_access(f->f_inode); return error; } static int do_dentry_open(struct file *f, int (*open)(struct inode *, struct file *)) { static const struct file_operations empty_fops = {}; struct inode *inode = f->f_path.dentry->d_inode; int error; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; f->f_wb_err = filemap_sample_wb_err(f->f_mapping); f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; file_set_fsnotify_mode(f, FMODE_NONOTIFY); f->f_op = &empty_fops; return 0; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f); if (error) goto cleanup_all; /* * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits * according to existing permission watches. * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a * pseudo file, this call will not change the mode. */ error = fsnotify_open_perm_and_set_mode(f); if (error) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); if (error) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } f->f_mode |= FMODE_OPENED; if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek) f->f_mode &= ~FMODE_LSEEK; if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO) f->f_mode |= FMODE_CAN_ODIRECT; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); f->f_iocb_flags = iocb_flags(f); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ if (f->f_mode & FMODE_WRITE) { /* * Depends on full fence from get_write_access() to synchronize * against collapse_file() regarding i_writecount and nr_thps * updates. Ensures subsequent insertion of THPs into the page * cache will fail. */ if (filemap_nr_thps(inode->i_mapping)) { struct address_space *mapping = inode->i_mapping; filemap_invalidate_lock(inode->i_mapping); /* * unmap_mapping_range just need to be called once * here, because the private pages is not need to be * unmapped mapping (e.g. data segment of dynamic * shared libraries here). */ unmap_mapping_range(mapping, 0, 0, 0); truncate_inode_pages(mapping, 0); filemap_invalidate_unlock(inode->i_mapping); } } return 0; cleanup_all: if (WARN_ON_ONCE(error > 0)) error = -EINVAL; fops_put(f->f_op); put_file_access(f); cleanup_file: path_put(&f->f_path); f->__f_path.mnt = NULL; f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } /** * finish_open - finish opening a file * @file: file pointer * @dentry: pointer to dentry * @open: open callback * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. * * NB: the dentry reference is _not_ consumed. If, for example, the dentry is * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)) { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) * * This can be used to set the result of a lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * * Returns 0 or -E..., which must be the return value of ->atomic_open() after * having called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { if (IS_ERR(dentry)) return PTR_ERR(dentry); file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); char *file_path(struct file *filp, char *buf, int buflen) { return d_path(&filp->f_path, buf, buflen); } EXPORT_SYMBOL(file_path); /** * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized */ int vfs_open(const struct path *path, struct file *file) { int ret; file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* * Once we return a file with FMODE_OPENED, __fput() will call * fsnotify_close(), so we need fsnotify_open() here for * symmetry. */ fsnotify_open(file); } return ret; } struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; struct file *f; /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } EXPORT_SYMBOL(dentry_open); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred) { struct file *f = alloc_empty_file(flags, cred); if (!IS_ERR(f)) { int error; file_set_fsnotify_mode(f, FMODE_NONOTIFY); error = vfs_open(path, f); if (error) { fput(f); f = ERR_PTR(error); } } return f; } /** * dentry_create - Create and open a file * @path: path to create * @flags: O_ flags * @mode: mode bits for new file * @cred: credentials to use * * Caller must hold the parent directory's lock, and have prepared * a negative dentry, placed in @path->dentry, for the new file. * * Caller sets @path->mnt to the vfsmount of the filesystem where * the new file is to be created. The parent directory and the * negative dentry must reside on the same filesystem instance. * * On success, returns a "struct file *". Otherwise a ERR_PTR * is returned. */ struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; error = vfs_create(mnt_idmap(path->mnt), d_inode(path->dentry->d_parent), path->dentry, mode, true); if (!error) error = vfs_open(path, f); if (unlikely(error)) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL(dentry_create); /** * kernel_file_open - open a file for kernel internal use * @path: path of the file to open * @flags: open flags * @cred: credentials for open * * Open a file for use by in-kernel consumers. The file is not accounted * against nr_files and must not be installed into the file descriptor * table. * * Return: Opened file on success, an error pointer on failure. */ struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred) { struct file *f; int error; f = alloc_empty_file_noaccount(flags, cred); if (IS_ERR(f)) return f; error = vfs_open(path, f); if (error) { fput(f); return ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(kernel_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, .mode = mode & S_IALLUGO, }; /* O_PATH beats everything else. */ if (how.flags & O_PATH) how.flags &= O_PATH_FLAGS; /* Modes should only be set for create-like flags. */ if (!WILL_CREATE(how.flags)) how.mode = 0; return how; } inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; u64 strip = O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), "struct open_flags doesn't yet handle flags > 32 bits"); /* * Strip flags that aren't relevant in determining struct open_flags. */ flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument * values before calling build_open_flags(), but openat2(2) checks all * of its arguments. */ if (flags & ~VALID_OPEN_FLAGS) return -EINVAL; if (how->resolve & ~VALID_RESOLVE_FLAGS) return -EINVAL; /* Scoping flags are mutually exclusive. */ if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT)) return -EINVAL; /* Deal with the mode. */ if (WILL_CREATE(flags)) { if (how->mode & ~S_IALLUGO) return -EINVAL; op->mode = how->mode | S_IFREG; } else { if (how->mode != 0) return -EINVAL; op->mode = 0; } /* * Block bugs where O_DIRECTORY | O_CREAT created regular files. * Note, that blocking O_DIRECTORY | O_CREAT here also protects * O_TMPFILE below which requires O_DIRECTORY being raised. */ if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT)) return -EINVAL; /* Now handle the creative implementation of O_TMPFILE. */ if (flags & __O_TMPFILE) { /* * In order to ensure programs get explicit errors when trying * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY * is raised alongside __O_TMPFILE. */ if (!(flags & O_DIRECTORY)) return -EINVAL; if (!(acc_mode & MAY_WRITE)) return -EINVAL; } if (flags & O_PATH) { /* O_PATH only permits certain other flags to be set. */ if (flags & ~O_PATH_FLAGS) return -EINVAL; acc_mode = 0; } /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's * always set instead of having to deal with possibly weird behaviour * for malicious applications setting only __O_SYNC. */ if (flags & __O_SYNC) flags |= O_DSYNC; op->open_flag = flags; /* O_TRUNC implies we need access checks for write permissions */ if (flags & O_TRUNC) acc_mode |= MAY_WRITE; /* Allow the LSM permission hook to distinguish append access from general write access. */ if (flags & O_APPEND) acc_mode |= MAY_APPEND; op->acc_mode = acc_mode; op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; flags |= O_NOFOLLOW; } } if (flags & O_DIRECTORY) lookup_flags |= LOOKUP_DIRECTORY; if (!(flags & O_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (how->resolve & RESOLVE_NO_XDEV) lookup_flags |= LOOKUP_NO_XDEV; if (how->resolve & RESOLVE_NO_MAGICLINKS) lookup_flags |= LOOKUP_NO_MAGICLINKS; if (how->resolve & RESOLVE_NO_SYMLINKS) lookup_flags |= LOOKUP_NO_SYMLINKS; if (how->resolve & RESOLVE_BENEATH) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; if (how->resolve & RESOLVE_CACHED) { /* Don't bother even trying for create/truncate/tmpfile open */ if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) return -EAGAIN; lookup_flags |= LOOKUP_CACHED; } op->lookup_flags = lookup_flags; return 0; } /** * file_open_name - open file and return file pointer * * @name: struct filename containing path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *file_open_name(struct filename *name, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_filp_open(AT_FDCWD, name, &op); } /** * filp_open - open file and return file pointer * * @filename: path to open * @flags: open flags as per the open(2) second argument * @mode: mode for the new file if O_CREAT is set, else ignored * * This is the helper to open a file from kernelspace if you really * have to. But in generally you should not do this, so please move * along, nothing to see here.. */ struct file *filp_open(const char *filename, int flags, umode_t mode) { struct filename *name = getname_kernel(filename); struct file *file = ERR_CAST(name); if (!IS_ERR(name)) { file = file_open_name(name, flags, mode); putname(name); } return file; } EXPORT_SYMBOL(filp_open); struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; struct open_how how = build_open_how(flags, mode); int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); static int do_sys_openat2(int dfd, const char __user *filename, struct open_how *how) { struct open_flags op; struct filename *tmp; int err, fd; err = build_open_flags(how, &op); if (unlikely(err)) return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fd_install(fd, f); } } putname(tmp); return fd; } int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); } SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, struct open_how __user *, how, size_t, usize) { int err; struct open_how tmp; BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0); BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST); if (unlikely(usize < OPEN_HOW_SIZE_VER0)) return -EINVAL; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize); if (err) return err; audit_openat2_how(&tmp); /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; return do_sys_openat2(dfd, filename, &tmp); } #ifdef CONFIG_COMPAT /* * Exactly like sys_open(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(AT_FDCWD, filename, flags, mode); } /* * Exactly like sys_openat(), except that it doesn't set the * O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { return do_sys_open(dfd, filename, flags, mode); } #endif #ifndef __alpha__ /* * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { int flags = O_CREAT | O_WRONLY | O_TRUNC; if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, pathname, flags, mode); } #endif /* * "id" is the POSIX thread ID. We use the * files pointer for this.. */ static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp, "VFS: Close: file count is 0 (f_op=%ps)", filp->f_op)) { return 0; } if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); if (likely(!(filp->f_mode & FMODE_PATH))) { dnotify_flush(filp, id); locks_remove_posix(filp, id); } return retval; } int filp_close(struct file *filp, fl_owner_t id) { int retval; retval = filp_flush(filp, id); fput_close(filp); return retval; } EXPORT_SYMBOL(filp_close); /* * Careful here! We test whether the file pointer is NULL before * releasing the fd. This ensures that one clone task can't release * an fd while another clone is opening it. */ SYSCALL_DEFINE1(close, unsigned int, fd) { int retval; struct file *file; file = file_close_fd(fd); if (!file) return -EBADF; retval = filp_flush(file, current->files); /* * We're returning to user space. Don't bother * with any delayed fput() cases. */ fput_close_sync(file); if (likely(retval == 0)) return 0; /* can't restart close syscall because file table entry was cleared */ if (retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; } /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. */ SYSCALL_DEFINE0(vhangup) { if (capable(CAP_SYS_TTY_CONFIG)) { tty_vhangup_self(); return 0; } return -EPERM; } /* * Called when an inode is about to be open. * We use this to disallow opening large files on 32bit systems if * the caller didn't specify O_LARGEFILE. On 64bit systems we force * on this flag in sys_open. */ int generic_file_open(struct inode * inode, struct file * filp) { if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable * file descriptors. The function is not supposed to ever fail, the only * reason it returns an 'int' and not 'void' is so that it can be plugged * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); return 0; } EXPORT_SYMBOL(nonseekable_open); /* * stream_open is used by subsystems that want stream-like file descriptors. * Such file descriptors are not seekable and don't have notion of position * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL). * Contrary to file descriptors of other regular files, .read() and .write() * can run simultaneously. * * stream_open never fails and is marked to return int so that it could be * directly used as file_operations.open . */ int stream_open(struct inode *inode, struct file *filp) { filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); filp->f_mode |= FMODE_STREAM; return 0; } EXPORT_SYMBOL(stream_open);
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "internal.h" static int cmdline_proc_show(struct seq_file *m, void *v) { seq_puts(m, saved_command_line); seq_putc(m, '\n'); return 0; } static int __init proc_cmdline_init(void) { struct proc_dir_entry *pde; pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); pde_make_permanent(pde); pde->size = saved_command_line_len + 1; return 0; } fs_initcall(proc_cmdline_init);
14 3 11 30 30 37 37 17 17 15 15 15 20 20 20 22 22 22 20 22 13 13 13 13 16 16 16 15 15 20 4 17 16 1 16 6 5 4 17 16 15 2 2 2 15 3 3 1 13 13 2 16 12 12 4 6 9 9 6 25 3 2 5 18 14 14 14 4 3 3 17 17 17 17 19 20 19 18 18 18 16 16 16 16 16 16 15 20 3 17 17 20 20 20 20 20 20 20 20 9 9 26 20 26 9 9 9 3 2 1 1 3 4 3 1 2 2 2 2 3 4 18 18 18 18 1 1 17 15 2 18 18 18 18 3 18 7 5 5 5 5 11 16 2 7 18 15 13 18 18 7 6 6 4 4 4 3 2 2 12 11 12 20 1 17 17 16 14 14 14 14 14 14 7 7 7 7 4 4 3 3 3 3 3 3 7 20 6 6 5 5 5 5 4 3 2 2 1 6 4 4 3 2 4 3 1 3 2 3 1 1 3 23 11 23 4 11 4 3 3 3 1 2 1 2 1 5 4 2 1 19 24 12 10 10 2 2 2 3 12 5 6 5 1 4 2 2 5 3 5 6 3 1 2 1 2 2 3 3 3 3 3 2 1 1 2 3 18 18 18 16 16 1 1 1 15 15 15 4 3 3 3 4 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 // SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ /* Implementation notes: * * - There are two kinds of sockets: those created by user action (such as * calling socket(2)) and those created by incoming connection request packets. * * - There are two "global" tables, one for bound sockets (sockets that have * specified an address that they are responsible for) and one for connected * sockets (sockets that have established a connection with another socket). * These tables are "global" in that all sockets on the system are placed * within them. - Note, though, that the bound table contains an extra entry * for a list of unbound sockets and SOCK_DGRAM sockets will always remain in * that list. The bound table is used solely for lookup of sockets when packets * are received and that's not necessary for SOCK_DGRAM sockets since we create * a datagram handle for each and need not perform a lookup. Keeping SOCK_DGRAM * sockets out of the bound hash buckets will reduce the chance of collisions * when looking for SOCK_STREAM sockets and prevents us from having to check the * socket type in the hash table lookups. * * - Sockets created by user action will either be "client" sockets that * initiate a connection or "server" sockets that listen for connections; we do * not support simultaneous connects (two "client" sockets connecting). * * - "Server" sockets are referred to as listener sockets throughout this * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. * When future packets are received for the address the listener socket is * bound to, we check if the source of the packet is from one that has an * existing pending connection. If it does, we process the packet for the * pending socket. When that socket reaches the connected state, it is removed * from the listener socket's pending list and enqueued in the listener * socket's accept queue. Callers of accept(2) will accept connected sockets * from the listener socket's accept queue. If the socket cannot be accepted * for some reason then it is marked rejected. Once the connection is * accepted, it is owned by the user process and the responsibility for cleanup * falls with that user process. * * - It is possible that these pending sockets will never reach the connected * state; in fact, we may never receive another packet after the connection * request. Because of this, we must schedule a cleanup function to run in the * future, after some amount of time passes where a connection should have been * established. This function ensures that the socket is off all lists so it * cannot be retrieved, then drops all references to the socket so it is cleaned * up (sock_put() -> sk_free() -> our sk_destruct implementation). Note this * function will also cleanup rejected sockets, those that reach the connected * state but leave it before they have been accepted. * * - Lock ordering for pending or accept queue sockets is: * * lock_sock(listener); * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); * * Using explicit nested locking keeps lockdep happy since normally only one * lock of a given class may be taken at a time. * * - Sockets created by user action will be cleaned up when the user process * calls close(2), causing our release implementation to be called. Our release * implementation will perform some cleanup then drop the last reference so our * sk_destruct implementation is invoked. Our sk_destruct implementation will * perform additional cleanup that's common for both types of sockets. * * - A socket's reference count is what ensures that the structure won't be * freed. Each entry in a list (such as the "global" bound and connected tables * and the listener socket's pending list and connected queue) ensures a * reference. When we defer work until process context and pass a socket as our * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. * * - sk->sk_state uses the TCP state constants because they are widely used by * other address families and exposed to userspace tools like ss(8): * * TCP_CLOSE - unconnected * TCP_SYN_SENT - connecting * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening */ #include <linux/compat.h> #include <linux/types.h> #include <linux/bitops.h> #include <linux/cred.h> #include <linux/errqueue.h> #include <linux/init.h> #include <linux/io.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> #include <linux/poll.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/af_vsock.h> #include <uapi/linux/vm_sockets.h> #include <uapi/asm-generic/ioctls.h> static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); static void vsock_close(struct sock *sk, long timeout); /* Protocol family. */ struct proto vsock_proto = { .name = "AF_VSOCK", .owner = THIS_MODULE, .obj_size = sizeof(struct vsock_sock), .close = vsock_close, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = vsock_bpf_update_proto, #endif }; /* The default peer timeout indicates how long we will wait for a peer response * to a control message. */ #define VSOCK_DEFAULT_CONNECT_TIMEOUT (2 * HZ) #define VSOCK_DEFAULT_BUFFER_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MAX_SIZE (1024 * 256) #define VSOCK_DEFAULT_BUFFER_MIN_SIZE 128 /* Transport used for host->guest communication */ static const struct vsock_transport *transport_h2g; /* Transport used for guest->host communication */ static const struct vsock_transport *transport_g2h; /* Transport used for DGRAM communication */ static const struct vsock_transport *transport_dgram; /* Transport used for local communication */ static const struct vsock_transport *transport_local; static DEFINE_MUTEX(vsock_register_mutex); /**** UTILS ****/ /* Each bound VSocket is stored in the bind hash table and each connected * VSocket is stored in the connected hash table. * * Unbound sockets are all put on the same list attached to the end of the hash * table (vsock_unbound_sockets). Bound sockets are added to the hash table in * the bucket that their local address hashes to (vsock_bound_sockets(addr) * represents the list that addr hashes to). * * Specifically, we initialize the vsock_bind_table array to a size of * VSOCK_HASH_SIZE + 1 so that vsock_bind_table[0] through * vsock_bind_table[VSOCK_HASH_SIZE - 1] are for bound sockets and * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) #define vsock_bound_sockets(addr) (&vsock_bind_table[VSOCK_HASH(addr)]) #define vsock_unbound_sockets (&vsock_bind_table[VSOCK_HASH_SIZE]) /* XXX This can probably be implemented in a better way. */ #define VSOCK_CONN_HASH(src, dst) \ (((src)->svm_cid ^ (dst)->svm_port) % VSOCK_HASH_SIZE) #define vsock_connected_sockets(src, dst) \ (&vsock_connected_table[VSOCK_CONN_HASH(src, dst)]) #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; EXPORT_SYMBOL_GPL(vsock_bind_table); struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; EXPORT_SYMBOL_GPL(vsock_connected_table); DEFINE_SPINLOCK(vsock_table_lock); EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); struct sockaddr_vm local_addr; if (vsock_addr_bound(&vsk->local_addr)) return 0; vsock_addr_init(&local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); return __vsock_bind(sk, &local_addr); } static void vsock_init_tables(void) { int i; for (i = 0; i < ARRAY_SIZE(vsock_bind_table); i++) INIT_LIST_HEAD(&vsock_bind_table[i]); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) INIT_LIST_HEAD(&vsock_connected_table[i]); } static void __vsock_insert_bound(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->bound_table, list); } static void __vsock_insert_connected(struct list_head *list, struct vsock_sock *vsk) { sock_hold(&vsk->sk); list_add(&vsk->connected_table, list); } static void __vsock_remove_bound(struct vsock_sock *vsk) { list_del_init(&vsk->bound_table); sock_put(&vsk->sk); } static void __vsock_remove_connected(struct vsock_sock *vsk) { list_del_init(&vsk->connected_table); sock_put(&vsk->sk); } static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { if (vsock_addr_equals_addr(addr, &vsk->local_addr)) return sk_vsock(vsk); if (addr->svm_port == vsk->local_addr.svm_port && (vsk->local_addr.svm_cid == VMADDR_CID_ANY || addr->svm_cid == VMADDR_CID_ANY)) return sk_vsock(vsk); } return NULL; } static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_connected_sockets(src, dst), connected_table) { if (vsock_addr_equals_addr(src, &vsk->remote_addr) && dst->svm_port == vsk->local_addr.svm_port) { return sk_vsock(vsk); } } return NULL; } static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); __vsock_insert_bound(vsock_unbound_sockets, vsk); spin_unlock_bh(&vsock_table_lock); } void vsock_insert_connected(struct vsock_sock *vsk) { struct list_head *list = vsock_connected_sockets( &vsk->remote_addr, &vsk->local_addr); spin_lock_bh(&vsock_table_lock); __vsock_insert_connected(list, vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_insert_connected); void vsock_remove_bound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_bound_table(vsk)) __vsock_remove_bound(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_bound); void vsock_remove_connected(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); if (__vsock_in_connected_table(vsk)) __vsock_remove_connected(vsk); spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_remove_connected); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_bound_socket(addr); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_bound_socket); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst) { struct sock *sk; spin_lock_bh(&vsock_table_lock); sk = __vsock_find_connected_socket(src, dst); if (sk) sock_hold(sk); spin_unlock_bh(&vsock_table_lock); return sk; } EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { /* Transport reassignment must not remove the binding. */ if (sock_flag(sk_vsock(vsk), SOCK_DEAD)) vsock_remove_bound(vsk); vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)) { int i; spin_lock_bh(&vsock_table_lock); for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], connected_table) { if (vsk->transport != transport) continue; fn(sk_vsock(vsk)); } } spin_unlock_bh(&vsock_table_lock); } EXPORT_SYMBOL_GPL(vsock_for_each_connected_socket); void vsock_add_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vlistener; struct vsock_sock *vpending; vlistener = vsock_sk(listener); vpending = vsock_sk(pending); sock_hold(pending); sock_hold(listener); list_add_tail(&vpending->pending_links, &vlistener->pending_links); } EXPORT_SYMBOL_GPL(vsock_add_pending); void vsock_remove_pending(struct sock *listener, struct sock *pending) { struct vsock_sock *vpending = vsock_sk(pending); list_del_init(&vpending->pending_links); sock_put(listener); sock_put(pending); } EXPORT_SYMBOL_GPL(vsock_remove_pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); vconnected = vsock_sk(connected); sock_hold(connected); sock_hold(listener); list_add_tail(&vconnected->accept_queue, &vlistener->accept_queue); } EXPORT_SYMBOL_GPL(vsock_enqueue_accept); static bool vsock_use_local_transport(unsigned int remote_cid) { lockdep_assert_held(&vsock_register_mutex); if (!transport_local) return false; if (remote_cid == VMADDR_CID_LOCAL) return true; if (transport_g2h) { return remote_cid == transport_g2h->get_local_cid(); } else { return remote_cid == VMADDR_CID_HOST; } } static void vsock_deassign_transport(struct vsock_sock *vsk) { if (!vsk->transport) return; vsk->transport->destruct(vsk); module_put(vsk->transport->module); vsk->transport = NULL; } /* Assign a transport to a socket and call the .init transport callback. * * Note: for connection oriented socket this must be called when vsk->remote_addr * is set (e.g. during the connect() or when a connection request on a listener * socket is received). * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; * - remote CID > VMADDR_CID_HOST will use host->guest transport; */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { const struct vsock_transport *new_transport; struct sock *sk = sk_vsock(vsk); unsigned int remote_cid = vsk->remote_addr.svm_cid; __u8 remote_flags; int ret; /* If the packet is coming with the source and destination CIDs higher * than VMADDR_CID_HOST, then a vsock channel where all the packets are * forwarded to the host should be established. Then the host will * need to forward the packets to the guest. * * The flag is set on the (listen) receive path (psk is not NULL). On * the connect path the flag can be set by the user space application. */ if (psk && vsk->local_addr.svm_cid > VMADDR_CID_HOST && vsk->remote_addr.svm_cid > VMADDR_CID_HOST) vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; remote_flags = vsk->remote_addr.svm_flags; mutex_lock(&vsock_register_mutex); switch (sk->sk_type) { case SOCK_DGRAM: new_transport = transport_dgram; break; case SOCK_STREAM: case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; else new_transport = transport_h2g; break; default: ret = -ESOCKTNOSUPPORT; goto err; } if (vsk->transport && vsk->transport == new_transport) { ret = 0; goto err; } /* We increase the module refcnt to prevent the transport unloading * while there are open sockets assigned to it. */ if (!new_transport || !try_module_get(new_transport->module)) { ret = -ENODEV; goto err; } /* It's safe to release the mutex after a successful try_module_get(). * Whichever transport `new_transport` points at, it won't go away until * the last module_put() below or in vsock_deassign_transport(). */ mutex_unlock(&vsock_register_mutex); if (vsk->transport) { /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this * function is called on a new socket which is not assigned to * any transport. */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); /* transport's release() and destruct() can touch some socket * state, since we are reassigning the socket to a new transport * during vsock_connect(), let's reset these fields to have a * clean state. */ sock_reset_flag(sk, SOCK_DONE); sk->sk_state = TCP_CLOSE; vsk->peer_shutdown = 0; } if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || !new_transport->seqpacket_allow(remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } } ret = new_transport->init(vsk, psk); if (ret) { module_put(new_transport->module); return ret; } vsk->transport = new_transport; return 0; err: mutex_unlock(&vsock_register_mutex); return ret; } EXPORT_SYMBOL_GPL(vsock_assign_transport); /* * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks. * Otherwise we may race with module removal. Do not use on `vsk->transport`. */ static u32 vsock_registered_transport_cid(const struct vsock_transport **transport) { u32 cid = VMADDR_CID_ANY; mutex_lock(&vsock_register_mutex); if (*transport) cid = (*transport)->get_local_cid(); mutex_unlock(&vsock_register_mutex); return cid; } bool vsock_find_cid(unsigned int cid) { if (cid == vsock_registered_transport_cid(&transport_g2h)) return true; if (transport_h2g && cid == VMADDR_CID_HOST) return true; if (transport_local && cid == VMADDR_CID_LOCAL) return true; return false; } EXPORT_SYMBOL_GPL(vsock_find_cid); static struct sock *vsock_dequeue_accept(struct sock *listener) { struct vsock_sock *vlistener; struct vsock_sock *vconnected; vlistener = vsock_sk(listener); if (list_empty(&vlistener->accept_queue)) return NULL; vconnected = list_entry(vlistener->accept_queue.next, struct vsock_sock, accept_queue); list_del_init(&vconnected->accept_queue); sock_put(listener); /* The caller will need a reference on the connected socket so we let * it call sock_put(). */ return sk_vsock(vconnected); } static bool vsock_is_accept_queue_empty(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return list_empty(&vsk->accept_queue); } static bool vsock_is_pending(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); return !list_empty(&vsk->pending_links); } static int vsock_send_shutdown(struct sock *sk, int mode) { struct vsock_sock *vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; return vsk->transport->shutdown(vsk, mode); } static void vsock_pending_work(struct work_struct *work) { struct sock *sk; struct sock *listener; struct vsock_sock *vsk; bool cleanup; vsk = container_of(work, struct vsock_sock, pending_work.work); sk = sk_vsock(vsk); listener = vsk->listener; cleanup = true; lock_sock(listener); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); sk_acceptq_removed(listener); } else if (!vsk->rejected) { /* We are not on the pending list and accept() did not reject * us, so we must have been accepted by our user process. We * just need to drop our references to the sockets and be on * our way. */ cleanup = false; goto out; } /* We need to remove ourself from the global connected sockets list so * incoming packets can't find this socket, and to reduce the reference * count. */ vsock_remove_connected(vsk); sk->sk_state = TCP_CLOSE; out: release_sock(sk); release_sock(listener); if (cleanup) sock_put(sk); sock_put(sk); sock_put(listener); } /**** SOCKET OPERATIONS ****/ static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm *addr) { static u32 port; struct sockaddr_vm new_addr; if (!port) port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); if (addr->svm_port == VMADDR_PORT_ANY) { bool found = false; unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { if (port == VMADDR_PORT_ANY || port <= LAST_RESERVED_PORT) port = LAST_RESERVED_PORT + 1; new_addr.svm_port = port++; if (!__vsock_find_bound_socket(&new_addr)) { found = true; break; } } if (!found) return -EADDRNOTAVAIL; } else { /* If port is in reserved range, ensure caller * has necessary privileges. */ if (addr->svm_port <= LAST_RESERVED_PORT && !capable(CAP_NET_BIND_SERVICE)) { return -EACCES; } if (__vsock_find_bound_socket(&new_addr)) return -EADDRINUSE; } vsock_addr_init(&vsk->local_addr, new_addr.svm_cid, new_addr.svm_port); /* Remove connection oriented sockets from the unbound list and add them * to the hash table for easy lookup by its address. The unbound list * is simply an extra entry at the end of the hash table, a trick used * by AF_UNIX. */ __vsock_remove_bound(vsk); __vsock_insert_bound(vsock_bound_sockets(&vsk->local_addr), vsk); return 0; } static int __vsock_bind_dgram(struct vsock_sock *vsk, struct sockaddr_vm *addr) { return vsk->transport->dgram_bind(vsk, addr); } static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr) { struct vsock_sock *vsk = vsock_sk(sk); int retval; /* First ensure this socket isn't already bound. */ if (vsock_addr_bound(&vsk->local_addr)) return -EINVAL; /* Now bind to the provided address or select appropriate values if * none are provided (VMADDR_CID_ANY and VMADDR_PORT_ANY). Note that * like AF_INET prevents binding to a non-local IP address (in most * cases), we only allow binding to a local CID. */ if (addr->svm_cid != VMADDR_CID_ANY && !vsock_find_cid(addr->svm_cid)) return -EADDRNOTAVAIL; switch (sk->sk_socket->type) { case SOCK_STREAM: case SOCK_SEQPACKET: spin_lock_bh(&vsock_table_lock); retval = __vsock_bind_connectible(vsk, addr); spin_unlock_bh(&vsock_table_lock); break; case SOCK_DGRAM: retval = __vsock_bind_dgram(vsk, addr); break; default: retval = -EINVAL; break; } return retval; } static void vsock_connect_timeout(struct work_struct *work); static struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, unsigned short type, int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; sock_init_data(sock, sk); /* sk->sk_type is normally set in sock_init_data, but only if sock is * non-NULL. We make sure that our sockets always have a type by * setting it here if needed. */ if (!sock) sk->sk_type = type; vsk = vsock_sk(sk); vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); INIT_LIST_HEAD(&vsk->connected_table); vsk->listener = NULL; INIT_LIST_HEAD(&vsk->pending_links); INIT_LIST_HEAD(&vsk->accept_queue); vsk->rejected = false; vsk->sent_request = false; vsk->ignore_connecting_rst = false; vsk->peer_shutdown = 0; INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); psk = parent ? vsock_sk(parent) : NULL; if (parent) { vsk->trusted = psk->trusted; vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; vsk->buffer_size = psk->buffer_size; vsk->buffer_min_size = psk->buffer_min_size; vsk->buffer_max_size = psk->buffer_max_size; security_sk_clone(parent, sk); } else { vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; vsk->buffer_size = VSOCK_DEFAULT_BUFFER_SIZE; vsk->buffer_min_size = VSOCK_DEFAULT_BUFFER_MIN_SIZE; vsk->buffer_max_size = VSOCK_DEFAULT_BUFFER_MAX_SIZE; } return sk; } static bool sock_type_connectible(u16 type) { return (type == SOCK_STREAM) || (type == SOCK_SEQPACKET); } static void __vsock_release(struct sock *sk, int level) { struct vsock_sock *vsk; struct sock *pending; vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk). */ lock_sock_nested(sk, level); /* Indicate to vsock_remove_sock() that the socket is being released and * can be removed from the bound_table. Unlike transport reassignment * case, where the socket must remain bound despite vsock_remove_sock() * being called from the transport release() callback. */ sock_set_flag(sk, SOCK_DEAD); if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) vsock_remove_sock(vsk); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; skb_queue_purge(&sk->sk_receive_queue); /* Clean up any sockets that never were accepted. */ while ((pending = vsock_dequeue_accept(sk)) != NULL) { __vsock_release(pending, SINGLE_DEPTH_NESTING); sock_put(pending); } release_sock(sk); sock_put(sk); } static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); /* Flush MSG_ZEROCOPY leftovers. */ __skb_queue_purge(&sk->sk_error_queue); vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and * possibly register the address family with the kernel. */ vsock_addr_init(&vsk->local_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); put_cred(vsk->owner); } static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int err; err = sock_queue_rcv_skb(sk, skb); if (err) kfree_skb(skb); return err; } struct sock *vsock_create_connected(struct sock *parent) { return __vsock_create(sock_net(parent), NULL, parent, GFP_KERNEL, parent->sk_type, 0); } EXPORT_SYMBOL_GPL(vsock_create_connected); s64 vsock_stream_has_data(struct vsock_sock *vsk) { if (WARN_ON(!vsk->transport)) return 0; return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); if (WARN_ON(!vsk->transport)) return 0; if (sk->sk_type == SOCK_SEQPACKET) return vsk->transport->seqpacket_has_data(vsk); else return vsock_stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_connectible_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { if (WARN_ON(!vsk->transport)) return 0; return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); void vsock_data_ready(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); if (vsock_stream_has_data(vsk) >= sk->sk_rcvlowat || sock_flag(sk, SOCK_DONE)) sk->sk_data_ready(sk); } EXPORT_SYMBOL_GPL(vsock_data_ready); /* Dummy callback required by sockmap. * See unconditional call of saved_close() in sock_map_close(). */ static void vsock_close(struct sock *sk, long timeout) { } static int vsock_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; sk->sk_prot->close(sk, 0); __vsock_release(sk, 0); sock->sk = NULL; sock->state = SS_FREE; return 0; } static int vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { int err; struct sock *sk; struct sockaddr_vm *vm_addr; sk = sock->sk; if (vsock_addr_cast(addr, addr_len, &vm_addr) != 0) return -EINVAL; lock_sock(sk); err = __vsock_bind(sk, vm_addr); release_sock(sk); return err; } static int vsock_getname(struct socket *sock, struct sockaddr *addr, int peer) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *vm_addr; sk = sock->sk; vsk = vsock_sk(sk); err = 0; lock_sock(sk); if (peer) { if (sock->state != SS_CONNECTED) { err = -ENOTCONN; goto out; } vm_addr = &vsk->remote_addr; } else { vm_addr = &vsk->local_addr; } BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage)); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); out: release_sock(sk); return err; } void vsock_linger(struct sock *sk) { DEFINE_WAIT_FUNC(wait, woken_wake_function); ssize_t (*unsent)(struct vsock_sock *vsk); struct vsock_sock *vsk = vsock_sk(sk); long timeout; if (!sock_flag(sk, SOCK_LINGER)) return; timeout = sk->sk_lingertime; if (!timeout) return; /* Transports must implement `unsent_bytes` if they want to support * SOCK_LINGER through `vsock_linger()` since we use it to check when * the socket can be closed. */ unsent = vsk->transport->unsent_bytes; if (!unsent) return; add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) break; } while (!signal_pending(current) && timeout); remove_wait_queue(sk_sleep(sk), &wait); } EXPORT_SYMBOL_GPL(vsock_linger); static int vsock_shutdown(struct socket *sock, int mode) { int err; struct sock *sk; /* User level uses SHUT_RD (0) and SHUT_WR (1), but the kernel uses * RCV_SHUTDOWN (1) and SEND_SHUTDOWN (2), so we must increment mode * here like the other address families do. Note also that the * increment makes SHUT_RDWR (2) into RCV_SHUTDOWN | SEND_SHUTDOWN (3), * which is what we want. */ mode++; if ((mode & ~SHUTDOWN_MASK) || !mode) return -EINVAL; /* If this is a connection oriented socket and it is not connected then * bail out immediately. If it is a DGRAM socket then we must first * kick the socket so that it wakes up from any sleeping calls, for * example recv(), and then afterwards return the error. */ sk = sock->sk; lock_sock(sk); if (sock->state == SS_UNCONNECTED) { err = -ENOTCONN; if (sock_type_connectible(sk->sk_type)) goto out; } else { sock->state = SS_DISCONNECTING; err = 0; } /* Receive and send shutdowns are treated alike. */ mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN); if (mode) { sk->sk_shutdown |= mode; sk->sk_state_change(sk); if (sock_type_connectible(sk->sk_type)) { sock_reset_flag(sk, SOCK_DONE); vsock_send_shutdown(sk, mode); } } out: release_sock(sk); return err; } static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk; __poll_t mask; struct vsock_sock *vsk; sk = sock->sk; vsk = vsock_sk(sk); poll_wait(file, sk_sleep(sk), wait); mask = 0; if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) /* Signify that there has been an error on this socket. */ mask |= EPOLLERR; /* INET sockets treat local write shutdown and peer write shutdown as a * case of EPOLLHUP set. */ if ((sk->sk_shutdown == SHUTDOWN_MASK) || ((sk->sk_shutdown & SEND_SHUTDOWN) && (vsk->peer_shutdown & SEND_SHUTDOWN))) { mask |= EPOLLHUP; } if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLRDHUP; } if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; if (sock->type == SOCK_DGRAM) { /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending. */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || (sk->sk_shutdown & RCV_SHUTDOWN)) { mask |= EPOLLIN | EPOLLRDNORM; } if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; lock_sock(sk); transport = vsk->transport; /* Listening sockets that have connections in their accept * queue can be read. */ if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= EPOLLIN | EPOLLRDNORM; /* If there is something in the queue then we can read. */ if (transport && transport->stream_is_active(vsk) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { bool data_ready_now = false; int target = sock_rcvlowat(sk, 0, INT_MAX); int ret = transport->notify_poll_in( vsk, target, &data_ready_now); if (ret < 0) { mask |= EPOLLERR; } else { if (data_ready_now) mask |= EPOLLIN | EPOLLRDNORM; } } /* Sockets whose connections have been closed, reset, or * terminated should also be considered read, and we check the * shutdown flag for that. */ if (sk->sk_shutdown & RCV_SHUTDOWN || vsk->peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLIN | EPOLLRDNORM; } /* Connected sockets that can produce data can be written. */ if (transport && sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( vsk, 1, &space_avail_now); if (ret < 0) { mask |= EPOLLERR; } else { if (space_avail_now) /* Remove EPOLLWRBAND since INET * sockets are not setting it. */ mask |= EPOLLOUT | EPOLLWRNORM; } } } /* Simulate INET socket poll behaviors, which sets * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= EPOLLOUT | EPOLLWRNORM; } release_sock(sk); } return mask; } static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); if (WARN_ON_ONCE(!vsk->transport)) return -ENODEV; return vsk->transport->read_skb(vsk, read_actor); } static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; const struct vsock_transport *transport; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; /* For now, MSG_DONTWAIT is always assumed... */ err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; err = vsock_auto_bind(vsk); if (err) goto out; /* If the provided message contains an address, use that. Otherwise * fall back on the socket's remote handle (if it has been connected). */ if (msg->msg_name && vsock_addr_cast(msg->msg_name, msg->msg_namelen, &remote_addr) == 0) { /* Ensure this address is of the right type and is a valid * destination. */ if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); if (!vsock_addr_bound(remote_addr)) { err = -EINVAL; goto out; } } else if (sock->state == SS_CONNECTED) { remote_addr = &vsk->remote_addr; if (remote_addr->svm_cid == VMADDR_CID_ANY) remote_addr->svm_cid = transport->get_local_cid(); /* XXX Should connect() or this function ensure remote_addr is * bound? */ if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EINVAL; goto out; } } else { err = -EINVAL; goto out; } if (!transport->dgram_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } err = transport->dgram_enqueue(vsk, remote_addr, msg, len); out: release_sock(sk); return err; } static int vsock_dgram_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; struct sockaddr_vm *remote_addr; sk = sock->sk; vsk = vsock_sk(sk); err = vsock_addr_cast(addr, addr_len, &remote_addr); if (err == -EAFNOSUPPORT && remote_addr->svm_family == AF_UNSPEC) { lock_sock(sk); vsock_addr_init(&vsk->remote_addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); sock->state = SS_UNCONNECTED; release_sock(sk); return 0; } else if (err != 0) return -EINVAL; lock_sock(sk); err = vsock_auto_bind(vsk); if (err) goto out; if (!vsk->transport->dgram_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; } memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); sock->state = SS_CONNECTED; /* sock map disallows redirection of non-TCP sockets with sk_state != * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams. * * This doesn't seem to be abnormal state for datagram sockets, as the * same approach can be see in other datagram socket types as well * (such as unix sockets). */ sk->sk_state = TCP_ESTABLISHED; out: release_sock(sk); return err; } int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); return vsk->transport->dgram_dequeue(vsk, msg, len, flags); } int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot; prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif return __vsock_dgram_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, int __user *arg) { struct sock *sk = sock->sk; struct vsock_sock *vsk; int ret; vsk = vsock_sk(sk); switch (cmd) { case SIOCINQ: { ssize_t n_bytes; if (!vsk->transport) { ret = -EOPNOTSUPP; break; } if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } n_bytes = vsock_stream_has_data(vsk); if (n_bytes < 0) { ret = n_bytes; break; } ret = put_user(n_bytes, arg); break; } case SIOCOUTQ: { ssize_t n_bytes; if (!vsk->transport || !vsk->transport->unsent_bytes) { ret = -EOPNOTSUPP; break; } if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } n_bytes = vsk->transport->unsent_bytes(vsk); if (n_bytes < 0) { ret = n_bytes; break; } ret = put_user(n_bytes, arg); break; } default: ret = -ENOIOCTLCMD; } return ret; } static int vsock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int ret; lock_sock(sock->sk); ret = vsock_do_ioctl(sock, cmd, (int __user *)arg); release_sock(sock->sk); return ret; } static const struct proto_ops vsock_dgram_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = sock_no_listen, .shutdown = vsock_shutdown, .sendmsg = vsock_dgram_sendmsg, .recvmsg = vsock_dgram_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_transport_cancel_pkt(struct vsock_sock *vsk) { const struct vsock_transport *transport = vsk->transport; if (!transport || !transport->cancel_pkt) return -EOPNOTSUPP; return transport->cancel_pkt(vsk); } static void vsock_connect_timeout(struct work_struct *work) { struct sock *sk; struct vsock_sock *vsk; vsk = container_of(work, struct vsock_sock, connect_work.work); sk = sk_vsock(vsk); lock_sock(sk); if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { sk->sk_state = TCP_CLOSE; sk->sk_socket->state = SS_UNCONNECTED; sk->sk_err = ETIMEDOUT; sk_error_report(sk); vsock_transport_cancel_pkt(vsk); } release_sock(sk); sock_put(sk); } static int vsock_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; struct sockaddr_vm *remote_addr; long timeout; DEFINE_WAIT(wait); err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); /* XXX AF_UNSPEC should make us disconnect like AF_INET. */ switch (sock->state) { case SS_CONNECTED: err = -EISCONN; goto out; case SS_DISCONNECTING: err = -EINVAL; goto out; case SS_CONNECTING: /* This continues on so we can move sock into the SS_CONNECTED * state once the connection has completed (at which point err * will be set to zero also). Otherwise, we will either wait * for the connection or return -EALREADY should this be a * non-blocking call. */ err = -EALREADY; if (flags & O_NONBLOCK) goto out; break; default: if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; } /* Set the remote address that we are connecting to. */ memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr)); err = vsock_assign_transport(vsk, NULL); if (err) goto out; transport = vsk->transport; /* The hypervisor and well-known contexts do not have socket * endpoints. */ if (!transport || !transport->stream_allow(remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; } if (vsock_msgzerocopy_allow(transport)) { set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); } else if (sock_flag(sk, SOCK_ZEROCOPY)) { /* If this option was set before 'connect()', * when transport was unknown, check that this * feature is supported here. */ err = -EOPNOTSUPP; goto out; } err = vsock_auto_bind(vsk); if (err) goto out; sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) goto out; /* sk_err might have been set as a result of an earlier * (failed) connect attempt. */ sk->sk_err = 0; /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ sock->state = SS_CONNECTING; err = -EINPROGRESS; } /* The receive path will handle all communication until we are able to * enter the connected state. Here we wait for the connection to be * completed or a notification of an error. */ timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* If the socket is already closing or it is in an error state, there * is no point in waiting. */ while (sk->sk_state != TCP_ESTABLISHED && sk->sk_state != TCP_CLOSING && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection * attempt, in case the peer doesn't respond in a * timely manner. We hold on to the socket until the * timeout fires. */ sock_hold(sk); /* If the timeout function is already scheduled, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ if (mod_delayed_work(system_percpu_wq, &vsk->connect_work, timeout)) sock_put(sk); /* Skip ahead to preserve error code set above. */ goto out_wait; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); vsock_remove_connected(vsk); goto out_wait; } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { err = -ETIMEDOUT; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } if (sk->sk_err) { err = -sk->sk_err; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; } out_wait: finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; } static int vsock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *listener; int err; struct sock *connected; struct vsock_sock *vconnected; long timeout; DEFINE_WAIT(wait); err = 0; listener = sock->sk; lock_sock(listener); if (!sock_type_connectible(sock->type)) { err = -EOPNOTSUPP; goto out; } if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } /* Wait for children sockets to appear; these are the new sockets * created upon connection establishment. */ timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK); prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); while ((connected = vsock_dequeue_accept(listener)) == NULL && listener->sk_err == 0) { release_sock(listener); timeout = schedule_timeout(timeout); finish_wait(sk_sleep(listener), &wait); lock_sock(listener); if (signal_pending(current)) { err = sock_intr_errno(timeout); goto out; } else if (timeout == 0) { err = -EAGAIN; goto out; } prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(listener), &wait); if (listener->sk_err) err = -listener->sk_err; if (connected) { sk_acceptq_removed(listener); lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); /* If the listener socket has received an error, then we should * reject this socket and return. Note that we simply mark the * socket rejected, drop our reference, and let the cleanup * function handle the cleanup; the fact that we found it in * the listener's accept queue guarantees that the cleanup * function hasn't run yet. */ if (err) { vconnected->rejected = true; } else { newsock->state = SS_CONNECTED; sock_graft(connected, newsock); if (vsock_msgzerocopy_allow(vconnected->transport)) set_bit(SOCK_SUPPORT_ZC, &connected->sk_socket->flags); } release_sock(connected); sock_put(connected); } out: release_sock(listener); return err; } static int vsock_listen(struct socket *sock, int backlog) { int err; struct sock *sk; struct vsock_sock *vsk; sk = sock->sk; lock_sock(sk); if (!sock_type_connectible(sk->sk_type)) { err = -EOPNOTSUPP; goto out; } if (sock->state != SS_UNCONNECTED) { err = -EINVAL; goto out; } vsk = vsock_sk(sk); if (!vsock_addr_bound(&vsk->local_addr)) { err = -EINVAL; goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; err = 0; out: release_sock(sk); return err; } static void vsock_update_buffer_size(struct vsock_sock *vsk, const struct vsock_transport *transport, u64 val) { if (val > vsk->buffer_max_size) val = vsk->buffer_max_size; if (val < vsk->buffer_min_size) val = vsk->buffer_min_size; if (val != vsk->buffer_size && transport && transport->notify_buffer_size) transport->notify_buffer_size(vsk, &val); vsk->buffer_size = val; } static int vsock_connectible_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; u64 val; if (level != AF_VSOCK && level != SOL_SOCKET) return -ENOPROTOOPT; #define COPY_IN(_v) \ do { \ if (optlen < sizeof(_v)) { \ err = -EINVAL; \ goto exit; \ } \ if (copy_from_sockptr(&_v, optval, sizeof(_v)) != 0) { \ err = -EFAULT; \ goto exit; \ } \ } while (0) err = 0; sk = sock->sk; vsk = vsock_sk(sk); lock_sock(sk); transport = vsk->transport; if (level == SOL_SOCKET) { int zerocopy; if (optname != SO_ZEROCOPY) { release_sock(sk); return sock_setsockopt(sock, level, optname, optval, optlen); } /* Use 'int' type here, because variable to * set this option usually has this type. */ COPY_IN(zerocopy); if (zerocopy < 0 || zerocopy > 1) { err = -EINVAL; goto exit; } if (transport && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto exit; } sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy); goto exit; } switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: COPY_IN(val); vsock_update_buffer_size(vsk, transport, val); break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: COPY_IN(val); vsk->buffer_max_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: COPY_IN(val); vsk->buffer_min_size = val; vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: { struct __kernel_sock_timeval tv; err = sock_copy_user_timeval(&tv, optval, optlen, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); if (err) break; if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { vsk->connect_timeout = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ)); if (vsk->connect_timeout == 0) vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; } else { err = -ERANGE; } break; } default: err = -ENOPROTOOPT; break; } #undef COPY_IN exit: release_sock(sk); return err; } static int vsock_connectible_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct vsock_sock *vsk = vsock_sk(sk); union { u64 val64; struct old_timeval32 tm32; struct __kernel_old_timeval tm; struct __kernel_sock_timeval stm; } v; int lv = sizeof(v.val64); int len; if (level != AF_VSOCK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; memset(&v, 0, sizeof(v)); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: v.val64 = vsk->buffer_size; break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: v.val64 = vsk->buffer_max_size; break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: v.val64 = vsk->buffer_min_size; break; case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: lv = sock_get_timeout(vsk->connect_timeout, &v, optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); break; default: return -ENOPROTOOPT; } if (len < lv) return -EINVAL; if (len > lv) len = lv; if (copy_to_user(optval, &v, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; ssize_t total_written; long timeout; int err; struct vsock_transport_send_notify_data send_data; DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); total_written = 0; err = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); transport = vsk->transport; /* Callers should not provide a destination with connection oriented * sockets. */ if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out; } /* Send data only if both sides are not shutdown in the direction. */ if (sk->sk_shutdown & SEND_SHUTDOWN || vsk->peer_shutdown & RCV_SHUTDOWN) { err = -EPIPE; goto out; } if (!transport || sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; } if (!vsock_addr_bound(&vsk->remote_addr)) { err = -EDESTADDRREQ; goto out; } if (msg->msg_flags & MSG_ZEROCOPY && !vsock_msgzerocopy_allow(transport)) { err = -EOPNOTSUPP; goto out; } /* Wait for room in the produce queue to enqueue our user's data. */ timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = transport->notify_send_init(vsk, &send_data); if (err < 0) goto out; while (total_written < len) { ssize_t written; add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && !(vsk->peer_shutdown & RCV_SHUTDOWN)) { /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } } remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after * sleeping. */ if (sk->sk_err) { err = -sk->sk_err; goto out_err; } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || (vsk->peer_shutdown & RCV_SHUTDOWN)) { err = -EPIPE; goto out_err; } err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) goto out_err; /* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is * smaller than the queue size. It is the caller's * responsibility to check how many bytes we were able to send. */ if (sk->sk_type == SOCK_SEQPACKET) { written = transport->seqpacket_enqueue(vsk, msg, len - total_written); } else { written = transport->stream_enqueue(vsk, msg, len - total_written); } if (written < 0) { err = written; goto out_err; } total_written += written; err = transport->notify_send_post_enqueue( vsk, written, &send_data); if (err < 0) goto out_err; } out_err: if (total_written > 0) { /* Return number of written bytes only if: * 1) SOCK_STREAM socket. * 2) SOCK_SEQPACKET socket when whole buffer is sent. */ if (sk->sk_type == SOCK_STREAM || total_written == len) err = total_written; } out: if (sk->sk_type == SOCK_STREAM) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); return err; } static int vsock_connectible_wait_data(struct sock *sk, struct wait_queue_entry *wait, long timeout, struct vsock_transport_recv_notify_data *recv_data, size_t target) { const struct vsock_transport *transport; struct vsock_sock *vsk; s64 data; int err; vsk = vsock_sk(sk); err = 0; transport = vsk->transport; while (1) { prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE); data = vsock_connectible_has_data(vsk); if (data != 0) break; if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) || (vsk->peer_shutdown & SEND_SHUTDOWN)) { break; } /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; break; } if (recv_data) { err = transport->notify_recv_pre_block(vsk, target, recv_data); if (err < 0) break; } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); break; } else if (timeout == 0) { err = -EAGAIN; break; } } finish_wait(sk_sleep(sk), wait); if (err) return err; /* Internal transport error when checking for available * data. XXX This should be changed to a connection * reset in a later change. */ if (data < 0) return -ENOMEM; return data; } static int __vsock_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct vsock_transport_recv_notify_data recv_data; const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t copied; size_t target; long timeout; int err; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; /* We must not copy less than target bytes into the user's buffer * before returning successfully, so we wait for the consume queue to * have that much data to consume before dequeueing. Note that this * makes it impossible to handle cases where target is greater than the * queue size. */ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (target >= transport->stream_rcvhiwat(vsk)) { err = -ENOMEM; goto out; } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); copied = 0; err = transport->notify_recv_init(vsk, target, &recv_data); if (err < 0) goto out; while (1) { ssize_t read; err = vsock_connectible_wait_data(sk, &wait, timeout, &recv_data, target); if (err <= 0) break; err = transport->notify_recv_pre_dequeue(vsk, target, &recv_data); if (err < 0) break; read = transport->stream_dequeue(vsk, msg, len - copied, flags); if (read < 0) { err = read; break; } copied += read; err = transport->notify_recv_post_dequeue(vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) goto out; if (read >= target || flags & MSG_PEEK) break; target -= read; } if (sk->sk_err) err = -sk->sk_err; else if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; if (copied > 0) err = copied; out: return err; } static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { const struct vsock_transport *transport; struct vsock_sock *vsk; ssize_t msg_len; long timeout; int err = 0; DEFINE_WAIT(wait); vsk = vsock_sk(sk); transport = vsk->transport; timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0); if (err <= 0) goto out; msg_len = transport->seqpacket_dequeue(vsk, msg, flags); if (msg_len < 0) { err = msg_len; goto out; } if (sk->sk_err) { err = -sk->sk_err; } else if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; } else { /* User sets MSG_TRUNC, so return real length of * packet. */ if (flags & MSG_TRUNC) err = msg_len; else err = len - msg_data_left(msg); /* Always set MSG_TRUNC if real length of packet is * bigger than user's buffer. */ if (msg_len > len) msg->msg_flags |= MSG_TRUNC; } out: return err; } int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk; struct vsock_sock *vsk; const struct vsock_transport *transport; int err; sk = sock->sk; if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR); vsk = vsock_sk(sk); err = 0; lock_sock(sk); transport = vsk->transport; if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occurred with the * SOCK_DONE flag. */ if (sock_flag(sk, SOCK_DONE)) err = 0; else err = -ENOTCONN; goto out; } if (flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; } /* We don't check peer_shutdown flag here since peer may actually shut * down, but there can be data in the queue that a local socket can * receive. */ if (sk->sk_shutdown & RCV_SHUTDOWN) { err = 0; goto out; } /* It is valid on Linux to pass in a zero-length receive buffer. This * is not an error. We may as well bail out now. */ if (!len) { err = 0; goto out; } if (sk->sk_type == SOCK_STREAM) err = __vsock_stream_recvmsg(sk, msg, len, flags); else err = __vsock_seqpacket_recvmsg(sk, msg, len, flags); out: release_sock(sk); return err; } int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { #ifdef CONFIG_BPF_SYSCALL struct sock *sk = sock->sk; const struct proto *prot; prot = READ_ONCE(sk->sk_prot); if (prot != &vsock_proto) return prot->recvmsg(sk, msg, len, flags, NULL); #endif return __vsock_connectible_recvmsg(sock, msg, len, flags); } EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg); static int vsock_set_rcvlowat(struct sock *sk, int val) { const struct vsock_transport *transport; struct vsock_sock *vsk; vsk = vsock_sk(sk); if (val > vsk->buffer_size) return -EINVAL; transport = vsk->transport; if (transport && transport->notify_set_rcvlowat) { int err; err = transport->notify_set_rcvlowat(vsk, val); if (err) return err; } WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); return 0; } static const struct proto_ops vsock_stream_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = vsock_set_rcvlowat, .read_skb = vsock_read_skb, }; static const struct proto_ops vsock_seqpacket_ops = { .family = PF_VSOCK, .owner = THIS_MODULE, .release = vsock_release, .bind = vsock_bind, .connect = vsock_connect, .socketpair = sock_no_socketpair, .accept = vsock_accept, .getname = vsock_getname, .poll = vsock_poll, .ioctl = vsock_ioctl, .listen = vsock_listen, .shutdown = vsock_shutdown, .setsockopt = vsock_connectible_setsockopt, .getsockopt = vsock_connectible_getsockopt, .sendmsg = vsock_connectible_sendmsg, .recvmsg = vsock_connectible_recvmsg, .mmap = sock_no_mmap, .read_skb = vsock_read_skb, }; static int vsock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct vsock_sock *vsk; struct sock *sk; int ret; if (!sock) return -EINVAL; if (protocol && protocol != PF_VSOCK) return -EPROTONOSUPPORT; switch (sock->type) { case SOCK_DGRAM: sock->ops = &vsock_dgram_ops; break; case SOCK_STREAM: sock->ops = &vsock_stream_ops; break; case SOCK_SEQPACKET: sock->ops = &vsock_seqpacket_ops; break; default: return -ESOCKTNOSUPPORT; } sock->state = SS_UNCONNECTED; sk = __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern); if (!sk) return -ENOMEM; vsk = vsock_sk(sk); if (sock->type == SOCK_DGRAM) { ret = vsock_assign_transport(vsk, NULL); if (ret < 0) { sock->sk = NULL; sock_put(sk); return ret; } } /* SOCK_DGRAM doesn't have 'setsockopt' callback set in its * proto_ops, so there is no handler for custom logic. */ if (sock_type_connectible(sock->type)) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); vsock_insert_unbound(vsk); return 0; } static const struct net_proto_family vsock_family_ops = { .family = AF_VSOCK, .create = vsock_create, .owner = THIS_MODULE, }; static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; int retval = 0; u32 cid; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: /* To be compatible with the VMCI behavior, we prioritize the * guest CID instead of well-know host CID (VMADDR_CID_HOST). */ cid = vsock_registered_transport_cid(&transport_g2h); if (cid == VMADDR_CID_ANY) cid = vsock_registered_transport_cid(&transport_h2g); if (cid == VMADDR_CID_ANY) cid = vsock_registered_transport_cid(&transport_local); if (put_user(cid, p) != 0) retval = -EFAULT; break; default: retval = -ENOIOCTLCMD; } return retval; } static long vsock_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT static long vsock_dev_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return vsock_dev_do_ioctl(filp, cmd, compat_ptr(arg)); } #endif static const struct file_operations vsock_device_ops = { .owner = THIS_MODULE, .unlocked_ioctl = vsock_dev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = vsock_dev_compat_ioctl, #endif .open = nonseekable_open, }; static struct miscdevice vsock_device = { .name = "vsock", .fops = &vsock_device_ops, }; static int __init vsock_init(void) { int err = 0; vsock_init_tables(); vsock_proto.owner = THIS_MODULE; vsock_device.minor = MISC_DYNAMIC_MINOR; err = misc_register(&vsock_device); if (err) { pr_err("Failed to register misc device\n"); goto err_reset_transport; } err = proto_register(&vsock_proto, 1); /* we want our slab */ if (err) { pr_err("Cannot register vsock protocol\n"); goto err_deregister_misc; } err = sock_register(&vsock_family_ops); if (err) { pr_err("could not register af_vsock (%d) address family: %d\n", AF_VSOCK, err); goto err_unregister_proto; } vsock_bpf_build_proto(); return 0; err_unregister_proto: proto_unregister(&vsock_proto); err_deregister_misc: misc_deregister(&vsock_device); err_reset_transport: return err; } static void __exit vsock_exit(void) { misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); } const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) { return vsk->transport; } EXPORT_SYMBOL_GPL(vsock_core_get_transport); int vsock_core_register(const struct vsock_transport *t, int features) { const struct vsock_transport *t_h2g, *t_g2h, *t_dgram, *t_local; int err = mutex_lock_interruptible(&vsock_register_mutex); if (err) return err; t_h2g = transport_h2g; t_g2h = transport_g2h; t_dgram = transport_dgram; t_local = transport_local; if (features & VSOCK_TRANSPORT_F_H2G) { if (t_h2g) { err = -EBUSY; goto err_busy; } t_h2g = t; } if (features & VSOCK_TRANSPORT_F_G2H) { if (t_g2h) { err = -EBUSY; goto err_busy; } t_g2h = t; } if (features & VSOCK_TRANSPORT_F_DGRAM) { if (t_dgram) { err = -EBUSY; goto err_busy; } t_dgram = t; } if (features & VSOCK_TRANSPORT_F_LOCAL) { if (t_local) { err = -EBUSY; goto err_busy; } t_local = t; } transport_h2g = t_h2g; transport_g2h = t_g2h; transport_dgram = t_dgram; transport_local = t_local; err_busy: mutex_unlock(&vsock_register_mutex); return err; } EXPORT_SYMBOL_GPL(vsock_core_register); void vsock_core_unregister(const struct vsock_transport *t) { mutex_lock(&vsock_register_mutex); if (transport_h2g == t) transport_h2g = NULL; if (transport_g2h == t) transport_g2h = NULL; if (transport_dgram == t) transport_dgram = NULL; if (transport_local == t) transport_local = NULL; mutex_unlock(&vsock_register_mutex); } EXPORT_SYMBOL_GPL(vsock_core_unregister); module_init(vsock_init); module_exit(vsock_exit); MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); MODULE_VERSION("1.0.2.0-k"); MODULE_LICENSE("GPL v2");
3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: hwxface - Public ACPICA hardware interfaces * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #define EXPORT_ACPI_INTERFACES #include <acpi/acpi.h> #include "accommon.h" #include "acnamesp.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwxface") /****************************************************************************** * * FUNCTION: acpi_reset * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Set reset register in memory or IO space. Note: Does not * support reset register in PCI config space, this must be * handled separately. * ******************************************************************************/ acpi_status acpi_reset(void) { struct acpi_generic_address *reset_reg; acpi_status status; ACPI_FUNCTION_TRACE(acpi_reset); reset_reg = &acpi_gbl_FADT.reset_register; /* Check if the reset register is supported */ if (!(acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) || !reset_reg->address) { return_ACPI_STATUS(AE_NOT_EXIST); } if (reset_reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { /* * For I/O space, write directly to the OSL. This bypasses the port * validation mechanism, which may block a valid write to the reset * register. * * NOTE: * The ACPI spec requires the reset register width to be 8, so we * hardcode it here and ignore the FADT value. This maintains * compatibility with other ACPI implementations that have allowed * BIOS code with bad register width values to go unnoticed. */ status = acpi_os_write_port((acpi_io_address)reset_reg->address, acpi_gbl_FADT.reset_value, ACPI_RESET_REGISTER_WIDTH); } else { /* Write the reset value to the reset register */ status = acpi_hw_write(acpi_gbl_FADT.reset_value, reset_reg); } return_ACPI_STATUS(status); } ACPI_EXPORT_SYMBOL(acpi_reset) /****************************************************************************** * * FUNCTION: acpi_read * * PARAMETERS: value - Where the value is returned * reg - GAS register structure * * RETURN: Status * * DESCRIPTION: Read from either memory or IO space. * * LIMITATIONS: <These limitations also apply to acpi_write> * bit_width must be exactly 8, 16, 32, or 64. * space_ID must be system_memory or system_IO. * bit_offset and access_width are currently ignored, as there has * not been a need to implement these. * ******************************************************************************/ acpi_status acpi_read(u64 *return_value, struct acpi_generic_address *reg) { acpi_status status; ACPI_FUNCTION_NAME(acpi_read); status = acpi_hw_read(return_value, reg); return (status); } ACPI_EXPORT_SYMBOL(acpi_read) /****************************************************************************** * * FUNCTION: acpi_write * * PARAMETERS: value - Value to be written * reg - GAS register structure * * RETURN: Status * * DESCRIPTION: Write to either memory or IO space. * ******************************************************************************/ acpi_status acpi_write(u64 value, struct acpi_generic_address *reg) { acpi_status status; ACPI_FUNCTION_NAME(acpi_write); status = acpi_hw_write(value, reg); return (status); } ACPI_EXPORT_SYMBOL(acpi_write) #if (!ACPI_REDUCED_HARDWARE) /******************************************************************************* * * FUNCTION: acpi_read_bit_register * * PARAMETERS: register_id - ID of ACPI Bit Register to access * return_value - Value that was read from the register, * normalized to bit position zero. * * RETURN: Status and the value read from the specified Register. Value * returned is normalized to bit0 (is shifted all the way right) * * DESCRIPTION: ACPI bit_register read function. Does not acquire the HW lock. * * SUPPORTS: Bit fields in PM1 Status, PM1 Enable, PM1 Control, and * PM2 Control. * * Note: The hardware lock is not required when reading the ACPI bit registers * since almost all of them are single bit and it does not matter that * the parent hardware register can be split across two physical * registers. The only multi-bit field is SLP_TYP in the PM1 control * register, but this field does not cross an 8-bit boundary (nor does * it make much sense to actually read this field.) * ******************************************************************************/ acpi_status acpi_read_bit_register(u32 register_id, u32 *return_value) { struct acpi_bit_register_info *bit_reg_info; u32 register_value; u32 value; acpi_status status; ACPI_FUNCTION_TRACE_U32(acpi_read_bit_register, register_id); /* Get the info structure corresponding to the requested ACPI Register */ bit_reg_info = acpi_hw_get_bit_register_info(register_id); if (!bit_reg_info) { return_ACPI_STATUS(AE_BAD_PARAMETER); } /* Read the entire parent register */ status = acpi_hw_register_read(bit_reg_info->parent_register, &register_value); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } /* Normalize the value that was read, mask off other bits */ value = ((register_value & bit_reg_info->access_bit_mask) >> bit_reg_info->bit_position); ACPI_DEBUG_PRINT((ACPI_DB_IO, "BitReg %X, ParentReg %X, Actual %8.8X, ReturnValue %8.8X\n", register_id, bit_reg_info->parent_register, register_value, value)); *return_value = value; return_ACPI_STATUS(AE_OK); } ACPI_EXPORT_SYMBOL(acpi_read_bit_register) /******************************************************************************* * * FUNCTION: acpi_write_bit_register * * PARAMETERS: register_id - ID of ACPI Bit Register to access * value - Value to write to the register, in bit * position zero. The bit is automatically * shifted to the correct position. * * RETURN: Status * * DESCRIPTION: ACPI Bit Register write function. Acquires the hardware lock * since most operations require a read/modify/write sequence. * * SUPPORTS: Bit fields in PM1 Status, PM1 Enable, PM1 Control, and * PM2 Control. * * Note that at this level, the fact that there may be actually two * hardware registers (A and B - and B may not exist) is abstracted. * ******************************************************************************/ acpi_status acpi_write_bit_register(u32 register_id, u32 value) { struct acpi_bit_register_info *bit_reg_info; acpi_cpu_flags lock_flags; u32 register_value; acpi_status status = AE_OK; ACPI_FUNCTION_TRACE_U32(acpi_write_bit_register, register_id); /* Get the info structure corresponding to the requested ACPI Register */ bit_reg_info = acpi_hw_get_bit_register_info(register_id); if (!bit_reg_info) { return_ACPI_STATUS(AE_BAD_PARAMETER); } lock_flags = acpi_os_acquire_raw_lock(acpi_gbl_hardware_lock); /* * At this point, we know that the parent register is one of the * following: PM1 Status, PM1 Enable, PM1 Control, or PM2 Control */ if (bit_reg_info->parent_register != ACPI_REGISTER_PM1_STATUS) { /* * 1) Case for PM1 Enable, PM1 Control, and PM2 Control * * Perform a register read to preserve the bits that we are not * interested in */ status = acpi_hw_register_read(bit_reg_info->parent_register, &register_value); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } /* * Insert the input bit into the value that was just read * and write the register */ ACPI_REGISTER_INSERT_VALUE(register_value, bit_reg_info->bit_position, bit_reg_info->access_bit_mask, value); status = acpi_hw_register_write(bit_reg_info->parent_register, register_value); } else { /* * 2) Case for PM1 Status * * The Status register is different from the rest. Clear an event * by writing 1, writing 0 has no effect. So, the only relevant * information is the single bit we're interested in, all others * should be written as 0 so they will be left unchanged. */ register_value = ACPI_REGISTER_PREPARE_BITS(value, bit_reg_info-> bit_position, bit_reg_info-> access_bit_mask); /* No need to write the register if value is all zeros */ if (register_value) { status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, register_value); } } ACPI_DEBUG_PRINT((ACPI_DB_IO, "BitReg %X, ParentReg %X, Value %8.8X, Actual %8.8X\n", register_id, bit_reg_info->parent_register, value, register_value)); unlock_and_exit: acpi_os_release_raw_lock(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } ACPI_EXPORT_SYMBOL(acpi_write_bit_register) #endif /* !ACPI_REDUCED_HARDWARE */ /******************************************************************************* * * FUNCTION: acpi_get_sleep_type_data * * PARAMETERS: sleep_state - Numeric sleep state * *sleep_type_a - Where SLP_TYPa is returned * *sleep_type_b - Where SLP_TYPb is returned * * RETURN: Status * * DESCRIPTION: Obtain the SLP_TYPa and SLP_TYPb values for the requested * sleep state via the appropriate \_Sx object. * * The sleep state package returned from the corresponding \_Sx_ object * must contain at least one integer. * * March 2005: * Added support for a package that contains two integers. This * goes against the ACPI specification which defines this object as a * package with one encoded DWORD integer. However, existing practice * by many BIOS vendors is to return a package with 2 or more integer * elements, at least one per sleep type (A/B). * * January 2013: * Therefore, we must be prepared to accept a package with either a * single integer or multiple integers. * * The single integer DWORD format is as follows: * BYTE 0 - Value for the PM1A SLP_TYP register * BYTE 1 - Value for the PM1B SLP_TYP register * BYTE 2-3 - Reserved * * The dual integer format is as follows: * Integer 0 - Value for the PM1A SLP_TYP register * Integer 1 - Value for the PM1A SLP_TYP register * ******************************************************************************/ acpi_status acpi_get_sleep_type_data(u8 sleep_state, u8 *sleep_type_a, u8 *sleep_type_b) { acpi_status status; struct acpi_evaluate_info *info; union acpi_operand_object **elements; ACPI_FUNCTION_TRACE(acpi_get_sleep_type_data); /* Validate parameters */ if ((sleep_state > ACPI_S_STATES_MAX) || !sleep_type_a || !sleep_type_b) { return_ACPI_STATUS(AE_BAD_PARAMETER); } /* Allocate the evaluation information block */ info = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); if (!info) { return_ACPI_STATUS(AE_NO_MEMORY); } /* * Evaluate the \_Sx namespace object containing the register values * for this state */ info->relative_pathname = acpi_gbl_sleep_state_names[sleep_state]; status = acpi_ns_evaluate(info); if (ACPI_FAILURE(status)) { if (status == AE_NOT_FOUND) { /* The _Sx states are optional, ignore NOT_FOUND */ goto final_cleanup; } goto warning_cleanup; } /* Must have a return object */ if (!info->return_object) { ACPI_ERROR((AE_INFO, "No Sleep State object returned from [%s]", info->relative_pathname)); status = AE_AML_NO_RETURN_VALUE; goto warning_cleanup; } /* Return object must be of type Package */ if (info->return_object->common.type != ACPI_TYPE_PACKAGE) { ACPI_ERROR((AE_INFO, "Sleep State return object is not a Package")); status = AE_AML_OPERAND_TYPE; goto return_value_cleanup; } /* * Any warnings about the package length or the object types have * already been issued by the predefined name module -- there is no * need to repeat them here. */ elements = info->return_object->package.elements; switch (info->return_object->package.count) { case 0: status = AE_AML_PACKAGE_LIMIT; break; case 1: if (elements[0]->common.type != ACPI_TYPE_INTEGER) { status = AE_AML_OPERAND_TYPE; break; } /* A valid _Sx_ package with one integer */ *sleep_type_a = (u8)elements[0]->integer.value; *sleep_type_b = (u8)(elements[0]->integer.value >> 8); break; case 2: default: if ((elements[0]->common.type != ACPI_TYPE_INTEGER) || (elements[1]->common.type != ACPI_TYPE_INTEGER)) { status = AE_AML_OPERAND_TYPE; break; } /* A valid _Sx_ package with two integers */ *sleep_type_a = (u8)elements[0]->integer.value; *sleep_type_b = (u8)elements[1]->integer.value; break; } return_value_cleanup: acpi_ut_remove_reference(info->return_object); warning_cleanup: if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, "While evaluating Sleep State [%s]", info->relative_pathname)); } final_cleanup: ACPI_FREE(info); return_ACPI_STATUS(status); } ACPI_EXPORT_SYMBOL(acpi_get_sleep_type_data)
2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 15 15 15 15 16 16 16 16 15 15 16 16 16 377 10 376 9 377 377 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/stats.c * * procfs-based user access to generic RPC statistics. The stats files * reside in /proc/net/rpc. * * The read routines assume that the buffer passed in is just big enough. * If you implement an RPC service that has its own stats routine which * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE * limit. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC /* * Get RPC client stats */ static int rpc_proc_show(struct seq_file *seq, void *v) { const struct rpc_stat *statp = seq->private; const struct rpc_program *prog = statp->program; unsigned int i, j; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u\n", statp->rpccnt, statp->rpcretrans, statp->rpcauthrefresh); for (i = 0; i < prog->nrvers; i++) { const struct rpc_version *vers = prog->version[i]; if (!vers) continue; seq_printf(seq, "proc%u %u", vers->number, vers->nrprocs); for (j = 0; j < vers->nrprocs; j++) seq_printf(seq, " %u", vers->counts[j]); seq_putc(seq, '\n'); } return 0; } static int rpc_proc_open(struct inode *inode, struct file *file) { return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { .proc_open = rpc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, }; /* * Get RPC server stats */ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { const struct svc_program *prog = statp->program; const struct svc_version *vers; unsigned int i, j, k; unsigned long count; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u %u %u\n", statp->rpccnt, statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, statp->rpcbadfmt, statp->rpcbadauth, statp->rpcbadclnt); for (i = 0; i < prog->pg_nvers; i++) { vers = prog->pg_vers[i]; if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); for (j = 0; j < vers->vs_nproc; j++) { count = 0; for_each_possible_cpu(k) count += per_cpu(vers->vs_count[j], k); seq_printf(seq, " %lu", count); } seq_putc(seq, '\n'); } } EXPORT_SYMBOL_GPL(svc_seq_show); /** * rpc_alloc_iostats - allocate an rpc_iostats structure * @clnt: RPC program, version, and xprt * */ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { struct rpc_iostats *stats; int i; stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); } return stats; } EXPORT_SYMBOL_GPL(rpc_alloc_iostats); /** * rpc_free_iostats - release an rpc_iostats structure * @stats: doomed rpc_iostats structure * */ void rpc_free_iostats(struct rpc_iostats *stats) { kfree(stats); } EXPORT_SYMBOL_GPL(rpc_free_iostats); /** * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task * @op_metrics: stat structure for OP that will accumulate stats from @task */ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; ktime_t backlog, execute, now; if (!op_metrics || !req) return; now = ktime_get(); spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; /* kernel API: om_ops must never become larger than om_ntrans */ op_metrics->om_ntrans += max(req->rq_ntrans, 1); op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; backlog = 0; if (ktime_to_ns(req->rq_xtime)) { backlog = ktime_sub(req->rq_xtime, task->tk_start); op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); execute = ktime_sub(now, task->tk_start); op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); if (task->tk_status < 0) op_metrics->om_error_status++; spin_unlock(&op_metrics->om_lock); trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); /** * rpc_count_iostats - tally up per-task stats * @task: completed rpc_task * @stats: array of stat structures * * Uses the statidx from @task */ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) { rpc_count_iostats_metrics(task, &stats[task->tk_msg.rpc_proc->p_statidx]); } EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, const struct rpc_procinfo *procs) { if (procs[op].p_name) seq_printf(seq, "\t%12s: ", procs[op].p_name); else if (op == 0) seq_printf(seq, "\t NULL: "); else seq_printf(seq, "\t%12u: ", op); } static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b) { a->om_ops += b->om_ops; a->om_ntrans += b->om_ntrans; a->om_timeouts += b->om_timeouts; a->om_bytes_sent += b->om_bytes_sent; a->om_bytes_recv += b->om_bytes_recv; a->om_queue = ktime_add(a->om_queue, b->om_queue); a->om_rtt = ktime_add(a->om_rtt, b->om_rtt); a->om_execute = ktime_add(a->om_execute, b->om_execute); a->om_error_status += b->om_error_status; } static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, int op, const struct rpc_procinfo *procs) { _print_name(seq, op, procs); seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n", stats->om_ops, stats->om_ntrans, stats->om_timeouts, stats->om_bytes_sent, stats->om_bytes_recv, ktime_to_ms(stats->om_queue), ktime_to_ms(stats->om_rtt), ktime_to_ms(stats->om_execute), stats->om_error_status); } static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv) { struct seq_file *seq = seqv; xprt->ops->print_stats(xprt, seq); return 0; } void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) { unsigned int op, maxproc = clnt->cl_maxproc; if (!clnt->cl_metrics) return; seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); seq_printf(seq, "p/v: %u/%u (%s)\n", clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name); rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq); seq_printf(seq, "\tper-op statistics\n"); for (op = 0; op < maxproc; op++) { struct rpc_iostats stats = {}; struct rpc_clnt *next = clnt; do { _add_rpc_iostats(&stats, &next->cl_metrics[op]); if (next == next->cl_parent) break; next = next->cl_parent; } while (next); _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo); } } EXPORT_SYMBOL_GPL(rpc_clnt_show_stats); /* * Register/unregister RPC proc files */ static inline struct proc_dir_entry * do_register(struct net *net, const char *name, void *data, const struct proc_ops *proc_ops) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc/%s\n", name); sn = net_generic(net, sunrpc_net_id); return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data); } struct proc_dir_entry * rpc_proc_register(struct net *net, struct rpc_stat *statp) { return do_register(net, statp->program->name, statp, &rpc_proc_ops); } EXPORT_SYMBOL_GPL(rpc_proc_register); void rpc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); void svc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(svc_proc_unregister); int rpc_proc_init(struct net *net) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc\n"); sn = net_generic(net, sunrpc_net_id); sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); if (sn->proc_net_rpc == NULL) return -ENOMEM; return 0; } void rpc_proc_exit(struct net *net) { dprintk("RPC: unregistering /proc/net/rpc\n"); remove_proc_entry("rpc", net->proc_net); }
1 2 2 2 2 5 2 2 2 5 4 2 2 2 2 2 2 2 9 8 9 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 3 5 8 8 8 8 8 8 8 7 7 7 1 7 7 7 7 11 9 10 10 5 3 9 10 10 10 10 1 1 9 8 1 9 9 9 7 7 2 3 6 5 2 2 2 2 2 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * Copyright (c) Nokia Corporation, 2007 * * Author: Artem Bityutskiy (Битюцкий Артём), * Frank Haverkamp */ /* * This file includes UBI initialization and building of UBI devices. * * When UBI is initialized, it attaches all the MTD devices specified as the * module load parameters or the kernel boot parameters. If MTD devices were * specified, UBI does not attach any MTD device, but it is possible to do * later using the "UBI control device". */ #include <linux/err.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/stringify.h> #include <linux/namei.h> #include <linux/stat.h> #include <linux/miscdevice.h> #include <linux/mtd/partitions.h> #include <linux/log2.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/major.h> #include "ubi.h" /* Maximum length of the 'mtd=' parameter */ #define MTD_PARAM_LEN_MAX 64 /* Maximum number of comma-separated items in the 'mtd=' parameter */ #define MTD_PARAM_MAX_COUNT 6 /* Maximum value for the number of bad PEBs per 1024 PEBs */ #define MAX_MTD_UBI_BEB_LIMIT 768 #ifdef CONFIG_MTD_UBI_MODULE #define ubi_is_module() 1 #else #define ubi_is_module() 0 #endif /** * struct mtd_dev_param - MTD device parameter description data structure. * @name: MTD character device node path, MTD device name, or MTD device number * string * @ubi_num: UBI number * @vid_hdr_offs: VID header offset * @max_beb_per1024: maximum expected number of bad PEBs per 1024 PEBs * @enable_fm: enable fastmap when value is non-zero * @need_resv_pool: reserve pool->max_size pebs when value is none-zero */ struct mtd_dev_param { char name[MTD_PARAM_LEN_MAX]; int ubi_num; int vid_hdr_offs; int max_beb_per1024; int enable_fm; int need_resv_pool; }; /* Numbers of elements set in the @mtd_dev_param array */ static int mtd_devs; /* MTD devices specification parameters */ static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES]; #ifdef CONFIG_MTD_UBI_FASTMAP /* UBI module parameter to enable fastmap automatically on non-fastmap images */ static bool fm_autoconvert; static bool fm_debug; #endif /* Slab cache for wear-leveling entries */ struct kmem_cache *ubi_wl_entry_slab; /* UBI control character device */ static struct miscdevice ubi_ctrl_cdev = { .minor = MISC_DYNAMIC_MINOR, .name = "ubi_ctrl", .fops = &ubi_ctrl_cdev_operations, }; /* All UBI devices in system */ static struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; /* Serializes UBI devices creations and removals */ DEFINE_MUTEX(ubi_devices_mutex); /* Protects @ubi_devices, @ubi->ref_count and @ubi->is_dead */ static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '/<sysfs>/class/ubi/' */ /* UBI version attribute ('/<sysfs>/class/ubi/version') */ static ssize_t version_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sprintf(buf, "%d\n", UBI_VERSION); } static CLASS_ATTR_RO(version); static struct attribute *ubi_class_attrs[] = { &class_attr_version.attr, NULL, }; ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */ const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf); /* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */ static struct device_attribute dev_eraseblock_size = __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_avail_eraseblocks = __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_total_eraseblocks = __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_volumes_count = __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_ec = __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_reserved_for_bad = __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bad_peb_count = __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_vol_count = __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_min_io_size = __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bgt_enabled = __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_mtd_num = __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_ro_mode = __ATTR(ro_mode, S_IRUGO, dev_attribute_show, NULL); /** * ubi_volume_notify - send a volume change notification. * @ubi: UBI device description object * @vol: volume description object of the changed volume * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * * This is a helper function which notifies all subscribers about a volume * change event (creation, removal, re-sizing, re-naming, updating). Returns * zero in case of success and a negative error code in case of failure. */ int ubi_volume_notify(struct ubi_device *ubi, struct ubi_volume *vol, int ntype) { int ret; struct ubi_notification nt; ubi_do_get_device_info(ubi, &nt.di); ubi_do_get_volume_info(ubi, vol, &nt.vi); switch (ntype) { case UBI_VOLUME_ADDED: case UBI_VOLUME_REMOVED: case UBI_VOLUME_RESIZED: case UBI_VOLUME_RENAMED: ret = ubi_update_fastmap(ubi); if (ret) ubi_msg(ubi, "Unable to write a new fastmap: %i", ret); } return blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); } /** * ubi_notify_all - send a notification to all volumes. * @ubi: UBI device description object * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * @nb: the notifier to call * * This function walks all volumes of UBI device @ubi and sends the @ntype * notification for each volume. If @nb is %NULL, then all registered notifiers * are called, otherwise only the @nb notifier is called. Returns the number of * sent notifications. */ int ubi_notify_all(struct ubi_device *ubi, int ntype, struct notifier_block *nb) { struct ubi_notification nt; int i, count = 0; ubi_do_get_device_info(ubi, &nt.di); mutex_lock(&ubi->device_mutex); for (i = 0; i < ubi->vtbl_slots; i++) { /* * Since the @ubi->device is locked, and we are not going to * change @ubi->volumes, we do not have to lock * @ubi->volumes_lock. */ if (!ubi->volumes[i]) continue; ubi_do_get_volume_info(ubi, ubi->volumes[i], &nt.vi); if (nb) nb->notifier_call(nb, ntype, &nt); else blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); count += 1; } mutex_unlock(&ubi->device_mutex); return count; } /** * ubi_enumerate_volumes - send "add" notification for all existing volumes. * @nb: the notifier to call * * This function walks all UBI devices and volumes and sends the * %UBI_VOLUME_ADDED notification for each volume. If @nb is %NULL, then all * registered notifiers are called, otherwise only the @nb notifier is called. * Returns the number of sent notifications. */ int ubi_enumerate_volumes(struct notifier_block *nb) { int i, count = 0; /* * Since the @ubi_devices_mutex is locked, and we are not going to * change @ubi_devices, we do not have to lock @ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (!ubi) continue; count += ubi_notify_all(ubi, UBI_VOLUME_ADDED, nb); } return count; } /** * ubi_get_device - get UBI device. * @ubi_num: UBI device number * * This function returns UBI device description object for UBI device number * @ubi_num, or %NULL if the device does not exist. This function increases the * device reference count to prevent removal of the device. In other words, the * device cannot be removed if its reference count is not zero. */ struct ubi_device *ubi_get_device(int ubi_num) { struct ubi_device *ubi; spin_lock(&ubi_devices_lock); ubi = ubi_devices[ubi_num]; if (ubi && ubi->is_dead) ubi = NULL; if (ubi) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); } spin_unlock(&ubi_devices_lock); return ubi; } /** * ubi_put_device - drop an UBI device reference. * @ubi: UBI device description object */ void ubi_put_device(struct ubi_device *ubi) { spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; put_device(&ubi->dev); spin_unlock(&ubi_devices_lock); } /** * ubi_get_by_major - get UBI device by character device major number. * @major: major number * * This function is similar to 'ubi_get_device()', but it searches the device * by its major number. */ struct ubi_device *ubi_get_by_major(int major) { int i; struct ubi_device *ubi; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); spin_unlock(&ubi_devices_lock); return ubi; } } spin_unlock(&ubi_devices_lock); return NULL; } /** * ubi_major2num - get UBI device number by character device major number. * @major: major number * * This function searches UBI device number object by its major number. If UBI * device was not found, this function returns -ENODEV, otherwise the UBI device * number is returned. */ int ubi_major2num(int major) { int i, ubi_num = -ENODEV; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_num = ubi->ubi_num; break; } } spin_unlock(&ubi_devices_lock); return ubi_num; } /* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */ static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct ubi_device *ubi; /* * The below code looks weird, but it actually makes sense. We get the * UBI device reference from the contained 'struct ubi_device'. But it * is unclear if the device was removed or not yet. Indeed, if the * device was removed before we increased its reference count, * 'ubi_get_device()' will return -ENODEV and we fail. * * Remember, 'struct ubi_device' is freed in the release function, so * we still can use 'ubi->ubi_num'. */ ubi = container_of(dev, struct ubi_device, dev); if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); else if (attr == &dev_avail_eraseblocks) ret = sprintf(buf, "%d\n", ubi->avail_pebs); else if (attr == &dev_total_eraseblocks) ret = sprintf(buf, "%d\n", ubi->good_peb_count); else if (attr == &dev_volumes_count) ret = sprintf(buf, "%d\n", ubi->vol_count - UBI_INT_VOL_COUNT); else if (attr == &dev_max_ec) ret = sprintf(buf, "%d\n", ubi->max_ec); else if (attr == &dev_reserved_for_bad) ret = sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); else if (attr == &dev_bad_peb_count) ret = sprintf(buf, "%d\n", ubi->bad_peb_count); else if (attr == &dev_max_vol_count) ret = sprintf(buf, "%d\n", ubi->vtbl_slots); else if (attr == &dev_min_io_size) ret = sprintf(buf, "%d\n", ubi->min_io_size); else if (attr == &dev_bgt_enabled) ret = sprintf(buf, "%d\n", ubi->thread_enabled); else if (attr == &dev_mtd_num) ret = sprintf(buf, "%d\n", ubi->mtd->index); else if (attr == &dev_ro_mode) ret = sprintf(buf, "%d\n", ubi->ro_mode); else ret = -EINVAL; return ret; } static struct attribute *ubi_dev_attrs[] = { &dev_eraseblock_size.attr, &dev_avail_eraseblocks.attr, &dev_total_eraseblocks.attr, &dev_volumes_count.attr, &dev_max_ec.attr, &dev_reserved_for_bad.attr, &dev_bad_peb_count.attr, &dev_max_vol_count.attr, &dev_min_io_size.attr, &dev_bgt_enabled.attr, &dev_mtd_num.attr, &dev_ro_mode.attr, NULL }; ATTRIBUTE_GROUPS(ubi_dev); static void dev_release(struct device *dev) { struct ubi_device *ubi = container_of(dev, struct ubi_device, dev); kfree(ubi); } /** * kill_volumes - destroy all user volumes. * @ubi: UBI device description object */ static void kill_volumes(struct ubi_device *ubi) { int i; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) ubi_free_volume(ubi, ubi->volumes[i]); } /** * uif_init - initialize user interfaces for an UBI device. * @ubi: UBI device description object * * This function initializes various user interfaces for an UBI device. If the * initialization fails at an early stage, this function frees all the * resources it allocated, returns an error. * * This function returns zero in case of success and a negative error code in * case of failure. */ static int uif_init(struct ubi_device *ubi) { int i, err; dev_t dev; sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num); /* * Major numbers for the UBI character devices are allocated * dynamically. Major numbers of volume character devices are * equivalent to ones of the corresponding UBI character device. Minor * numbers of UBI character devices are 0, while minor numbers of * volume character devices start from 1. Thus, we allocate one major * number and ubi->vtbl_slots + 1 minor numbers. */ err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name); if (err) { ubi_err(ubi, "cannot register UBI character devices"); return err; } ubi->dev.devt = dev; ubi_assert(MINOR(dev) == 0); cdev_init(&ubi->cdev, &ubi_cdev_operations); dbg_gen("%s major is %u", ubi->ubi_name, MAJOR(dev)); ubi->cdev.owner = THIS_MODULE; dev_set_name(&ubi->dev, UBI_NAME_STR "%d", ubi->ubi_num); err = cdev_device_add(&ubi->cdev, &ubi->dev); if (err) goto out_unreg; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) { err = ubi_add_volume(ubi, ubi->volumes[i]); if (err) { ubi_err(ubi, "cannot add volume %d", i); ubi->volumes[i] = NULL; goto out_volumes; } } return 0; out_volumes: kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); out_unreg: unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); ubi_err(ubi, "cannot initialize UBI %s, error %d", ubi->ubi_name, err); return err; } /** * uif_close - close user interfaces for an UBI device. * @ubi: UBI device description object * * Note, since this function un-registers UBI volume device objects (@vol->dev), * the memory allocated voe the volumes is freed as well (in the release * function). */ static void uif_close(struct ubi_device *ubi) { kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); } /** * ubi_free_volumes_from - free volumes from specific index. * @ubi: UBI device description object * @from: the start index used for volume free. */ static void ubi_free_volumes_from(struct ubi_device *ubi, int from) { int i; for (i = from; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { if (!ubi->volumes[i] || ubi->volumes[i]->is_dead) continue; ubi_eba_replace_table(ubi->volumes[i], NULL); ubi_fastmap_destroy_checkmap(ubi->volumes[i]); kfree(ubi->volumes[i]); ubi->volumes[i] = NULL; } } /** * ubi_free_all_volumes - free all volumes. * @ubi: UBI device description object */ void ubi_free_all_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, 0); } /** * ubi_free_internal_volumes - free internal volumes. * @ubi: UBI device description object */ void ubi_free_internal_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, ubi->vtbl_slots); } static int get_bad_peb_limit(const struct ubi_device *ubi, int max_beb_per1024) { int limit, device_pebs; uint64_t device_size; if (!max_beb_per1024) { /* * Since max_beb_per1024 has not been set by the user in either * the cmdline or Kconfig, use mtd_max_bad_blocks to set the * limit if it is supported by the device. */ limit = mtd_max_bad_blocks(ubi->mtd, 0, ubi->mtd->size); if (limit < 0) return 0; return limit; } /* * Here we are using size of the entire flash chip and * not just the MTD partition size because the maximum * number of bad eraseblocks is a percentage of the * whole device and bad eraseblocks are not fairly * distributed over the flash chip. So the worst case * is that all the bad eraseblocks of the chip are in * the MTD partition we are attaching (ubi->mtd). */ device_size = mtd_get_device_size(ubi->mtd); device_pebs = mtd_div_by_eb(device_size, ubi->mtd); limit = mult_frac(device_pebs, max_beb_per1024, 1024); /* Round it up */ if (mult_frac(limit, 1024, max_beb_per1024) < device_pebs) limit += 1; return limit; } /** * io_init - initialize I/O sub-system for a given UBI device. * @ubi: UBI device description object * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are * assumed: * o EC header is always at offset zero - this cannot be changed; * o VID header starts just after the EC header at the closest address * aligned to @io->hdrs_min_io_size; * o data starts just after the VID header at the closest address aligned to * @io->min_io_size * * This function returns zero in case of success and a negative error code in * case of failure. */ static int io_init(struct ubi_device *ubi, int max_beb_per1024) { dbg_gen("sizeof(struct ubi_ainf_peb) %zu", sizeof(struct ubi_ainf_peb)); dbg_gen("sizeof(struct ubi_wl_entry) %zu", sizeof(struct ubi_wl_entry)); if (ubi->mtd->numeraseregions != 0) { /* * Some flashes have several erase regions. Different regions * may have different eraseblock size and other * characteristics. It looks like mostly multi-region flashes * have one "main" region and one or more small regions to * store boot loader code or boot parameters or whatever. I * guess we should just pick the largest region. But this is * not implemented. */ ubi_err(ubi, "multiple regions, not implemented"); return -EINVAL; } if (ubi->vid_hdr_offset < 0) return -EINVAL; /* * Note, in this implementation we support MTD devices with 0x7FFFFFFF * physical eraseblocks maximum. */ ubi->peb_size = ubi->mtd->erasesize; ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (mtd_can_have_bb(ubi->mtd)) { ubi->bad_allowed = 1; ubi->bad_peb_limit = get_bad_peb_limit(ubi, max_beb_per1024); } if (ubi->mtd->type == MTD_NORFLASH) ubi->nor_flash = 1; ubi->min_io_size = ubi->mtd->writesize; ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; /* * Make sure minimal I/O unit is power of 2. Note, there is no * fundamental reason for this assumption. It is just an optimization * which allows us to avoid costly division operations. */ if (!is_power_of_2(ubi->min_io_size)) { ubi_err(ubi, "min. I/O unit (%d) is not power of 2", ubi->min_io_size); return -EINVAL; } ubi_assert(ubi->hdrs_min_io_size > 0); ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); ubi->max_write_size = ubi->mtd->writebufsize; /* * Maximum write size has to be greater or equivalent to min. I/O * size, and be multiple of min. I/O size. */ if (ubi->max_write_size < ubi->min_io_size || ubi->max_write_size % ubi->min_io_size || !is_power_of_2(ubi->max_write_size)) { ubi_err(ubi, "bad write buffer size %d for %d min. I/O unit", ubi->max_write_size, ubi->min_io_size); return -EINVAL; } /* Calculate default aligned sizes of EC and VID headers */ ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); dbg_gen("min_io_size %d", ubi->min_io_size); dbg_gen("max_write_size %d", ubi->max_write_size); dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); dbg_gen("ec_hdr_alsize %d", ubi->ec_hdr_alsize); dbg_gen("vid_hdr_alsize %d", ubi->vid_hdr_alsize); if (ubi->vid_hdr_offset == 0) /* Default offset */ ubi->vid_hdr_offset = ubi->vid_hdr_aloffset = ubi->ec_hdr_alsize; else { ubi->vid_hdr_aloffset = ubi->vid_hdr_offset & ~(ubi->hdrs_min_io_size - 1); ubi->vid_hdr_shift = ubi->vid_hdr_offset - ubi->vid_hdr_aloffset; } /* * Memory allocation for VID header is ubi->vid_hdr_alsize * which is described in comments in io.c. * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds * ubi->vid_hdr_alsize, so that all vid header operations * won't access memory out of bounds. */ if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" " + VID header size(%zu) > VID header aligned size(%d).", ubi->vid_hdr_offset, ubi->vid_hdr_shift, UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); return -EINVAL; } /* Similar for the data offset */ ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); dbg_gen("vid_hdr_offset %d", ubi->vid_hdr_offset); dbg_gen("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); dbg_gen("vid_hdr_shift %d", ubi->vid_hdr_shift); dbg_gen("leb_start %d", ubi->leb_start); /* The shift must be aligned to 32-bit boundary */ if (ubi->vid_hdr_shift % 4) { ubi_err(ubi, "unaligned VID header shift %d", ubi->vid_hdr_shift); return -EINVAL; } /* Check sanity */ if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || ubi->leb_start & (ubi->min_io_size - 1)) { ubi_err(ubi, "bad VID header (%d) or data offsets (%d)", ubi->vid_hdr_offset, ubi->leb_start); return -EINVAL; } /* * Set maximum amount of physical erroneous eraseblocks to be 10%. * Erroneous PEB are those which have read errors. */ ubi->max_erroneous = ubi->peb_count / 10; if (ubi->max_erroneous < 16) ubi->max_erroneous = 16; dbg_gen("max_erroneous %d", ubi->max_erroneous); /* * It may happen that EC and VID headers are situated in one minimal * I/O unit. In this case we can only accept this UBI image in * read-only mode. */ if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) { ubi_warn(ubi, "EC and VID headers are in the same minimal I/O unit, switch to read-only mode"); ubi->ro_mode = 1; } ubi->leb_size = ubi->peb_size - ubi->leb_start; if (!(ubi->mtd->flags & MTD_WRITEABLE)) { ubi_msg(ubi, "MTD device %d is write-protected, attach in read-only mode", ubi->mtd->index); ubi->ro_mode = 1; } /* * Note, ideally, we have to initialize @ubi->bad_peb_count here. But * unfortunately, MTD does not provide this information. We should loop * over all physical eraseblocks and invoke mtd->block_is_bad() for * each physical eraseblock. So, we leave @ubi->bad_peb_count * uninitialized so far. */ return 0; } /** * autoresize - re-size the volume which has the "auto-resize" flag set. * @ubi: UBI device description object * @vol_id: ID of the volume to re-size * * This function re-sizes the volume marked by the %UBI_VTBL_AUTORESIZE_FLG in * the volume table to the largest possible size. See comments in ubi-header.h * for more description of the flag. Returns zero in case of success and a * negative error code in case of failure. */ static int autoresize(struct ubi_device *ubi, int vol_id) { struct ubi_volume_desc desc; struct ubi_volume *vol = ubi->volumes[vol_id]; int err, old_reserved_pebs = vol->reserved_pebs; if (ubi->ro_mode) { ubi_warn(ubi, "skip auto-resize because of R/O mode"); return 0; } /* * Clear the auto-resize flag in the volume in-memory copy of the * volume table, and 'ubi_resize_volume()' will propagate this change * to the flash. */ ubi->vtbl[vol_id].flags &= ~UBI_VTBL_AUTORESIZE_FLG; if (ubi->avail_pebs == 0) { struct ubi_vtbl_record vtbl_rec; /* * No available PEBs to re-size the volume, clear the flag on * flash and exit. */ vtbl_rec = ubi->vtbl[vol_id]; err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); if (err) ubi_err(ubi, "cannot clean auto-resize flag for volume %d", vol_id); } else { desc.vol = vol; err = ubi_resize_volume(&desc, old_reserved_pebs + ubi->avail_pebs); if (err) ubi_err(ubi, "cannot auto-resize volume %d", vol_id); } if (err) return err; ubi_msg(ubi, "volume %d (\"%s\") re-sized from %d to %d LEBs", vol_id, vol->name, old_reserved_pebs, vol->reserved_pebs); return 0; } /** * ubi_attach_mtd_dev - attach an MTD device. * @mtd: MTD device description object * @ubi_num: number to assign to the new UBI device * @vid_hdr_offset: VID header offset * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * @disable_fm: whether disable fastmap * @need_resv_pool: whether reserve pebs to fill fm_pool * * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in * which case this function finds a vacant device number and assigns it * automatically. Returns the new UBI device number in case of success and a * negative error code in case of failure. * * If @disable_fm is true, ubi doesn't create new fastmap even the module param * 'fm_autoconvert' is set, and existed old fastmap will be destroyed after * doing full scanning. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset, int max_beb_per1024, bool disable_fm, bool need_resv_pool) { struct ubi_device *ubi; int i, err; if (max_beb_per1024 < 0 || max_beb_per1024 > MAX_MTD_UBI_BEB_LIMIT) return -EINVAL; if (!max_beb_per1024) max_beb_per1024 = CONFIG_MTD_UBI_BEB_LIMIT; /* * Check if we already have the same MTD device attached. * * Note, this function assumes that UBI devices creations and deletions * are serialized, so it does not take the &ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && mtd->index == ubi->mtd->index) { pr_err("ubi: mtd%d is already attached to ubi%d\n", mtd->index, i); return -EEXIST; } } /* * Make sure this MTD device is not emulated on top of an UBI volume * already. Well, generally this recursion works fine, but there are * different problems like the UBI module takes a reference to itself * by attaching (and thus, opening) the emulated MTD device. This * results in inability to unload the module. And in general it makes * no sense to attach emulated MTD devices, so we prohibit this. */ if (mtd->type == MTD_UBIVOLUME) { pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI\n", mtd->index); return -EINVAL; } /* * Both UBI and UBIFS have been designed for SLC NAND and NOR flashes. * MLC NAND is different and needs special care, otherwise UBI or UBIFS * will die soon and you will lose all your data. * Relax this rule if the partition we're attaching to operates in SLC * mode. */ if (mtd->type == MTD_MLCNANDFLASH && !(mtd->flags & MTD_SLC_ON_MLC_EMULATION)) { pr_err("ubi: refuse attaching mtd%d - MLC NAND is not supported\n", mtd->index); return -EINVAL; } /* UBI cannot work on flashes with zero erasesize. */ if (!mtd->erasesize) { pr_err("ubi: refuse attaching mtd%d - zero erasesize flash is not supported\n", mtd->index); return -EINVAL; } if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) if (!ubi_devices[ubi_num]) break; if (ubi_num == UBI_MAX_DEVICES) { pr_err("ubi: only %d UBI devices may be created\n", UBI_MAX_DEVICES); return -ENFILE; } } else { if (ubi_num >= UBI_MAX_DEVICES) return -EINVAL; /* Make sure ubi_num is not busy */ if (ubi_devices[ubi_num]) { pr_err("ubi: ubi%i already exists\n", ubi_num); return -EEXIST; } } ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL); if (!ubi) return -ENOMEM; device_initialize(&ubi->dev); ubi->dev.release = dev_release; ubi->dev.class = &ubi_class; ubi->dev.groups = ubi_dev_groups; ubi->dev.parent = &mtd->dev; ubi->mtd = mtd; ubi->ubi_num = ubi_num; ubi->vid_hdr_offset = vid_hdr_offset; ubi->autoresize_vol_id = -1; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_pool.used = ubi->fm_pool.size = 0; ubi->fm_wl_pool.used = ubi->fm_wl_pool.size = 0; /* * fm_pool.max_size is 5% of the total number of PEBs but it's also * between UBI_FM_MAX_POOL_SIZE and UBI_FM_MIN_POOL_SIZE. */ ubi->fm_pool.max_size = min(((int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) / 100) * 5, UBI_FM_MAX_POOL_SIZE); ubi->fm_pool.max_size = max(ubi->fm_pool.max_size, UBI_FM_MIN_POOL_SIZE); ubi->fm_wl_pool.max_size = ubi->fm_pool.max_size / 2; ubi->fm_pool_rsv_cnt = need_resv_pool ? ubi->fm_pool.max_size : 0; ubi->fm_disabled = (!fm_autoconvert || disable_fm) ? 1 : 0; if (fm_debug) ubi_enable_dbg_chk_fastmap(ubi); if (!ubi->fm_disabled && (int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) <= UBI_FM_MAX_START) { ubi_err(ubi, "More than %i PEBs are needed for fastmap, sorry.", UBI_FM_MAX_START); ubi->fm_disabled = 1; } ubi_msg(ubi, "default fastmap pool size: %d", ubi->fm_pool.max_size); ubi_msg(ubi, "default fastmap WL pool size: %d", ubi->fm_wl_pool.max_size); #else ubi->fm_disabled = 1; #endif mutex_init(&ubi->buf_mutex); mutex_init(&ubi->ckvol_mutex); mutex_init(&ubi->device_mutex); spin_lock_init(&ubi->volumes_lock); init_rwsem(&ubi->fm_protect); init_rwsem(&ubi->fm_eba_sem); ubi_msg(ubi, "attaching mtd%d", mtd->index); err = io_init(ubi, max_beb_per1024); if (err) goto out_free; err = -ENOMEM; ubi->peb_buf = vmalloc(ubi->peb_size); if (!ubi->peb_buf) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_size = ubi_calc_fm_size(ubi); ubi->fm_buf = vzalloc(ubi->fm_size); if (!ubi->fm_buf) goto out_free; #endif err = ubi_attach(ubi, disable_fm ? 1 : 0); if (err) { ubi_err(ubi, "failed to attach mtd%d, error %d", mtd->index, err); goto out_free; } if (ubi->autoresize_vol_id != -1) { err = autoresize(ubi, ubi->autoresize_vol_id); if (err) goto out_detach; } err = uif_init(ubi); if (err) goto out_detach; err = ubi_debugfs_init_dev(ubi); if (err) goto out_uif; ubi->bgt_thread = kthread_create(ubi_thread, ubi, "%s", ubi->bgt_name); if (IS_ERR(ubi->bgt_thread)) { err = PTR_ERR(ubi->bgt_thread); ubi_err(ubi, "cannot spawn \"%s\", error %d", ubi->bgt_name, err); goto out_debugfs; } ubi_msg(ubi, "attached mtd%d (name \"%s\", size %llu MiB)", mtd->index, mtd->name, ubi->flash_size >> 20); ubi_msg(ubi, "PEB size: %d bytes (%d KiB), LEB size: %d bytes", ubi->peb_size, ubi->peb_size >> 10, ubi->leb_size); ubi_msg(ubi, "min./max. I/O unit sizes: %d/%d, sub-page size %d", ubi->min_io_size, ubi->max_write_size, ubi->hdrs_min_io_size); ubi_msg(ubi, "VID header offset: %d (aligned %d), data offset: %d", ubi->vid_hdr_offset, ubi->vid_hdr_aloffset, ubi->leb_start); ubi_msg(ubi, "good PEBs: %d, bad PEBs: %d, corrupted PEBs: %d", ubi->good_peb_count, ubi->bad_peb_count, ubi->corr_peb_count); ubi_msg(ubi, "user volume: %d, internal volumes: %d, max. volumes count: %d", ubi->vol_count - UBI_INT_VOL_COUNT, UBI_INT_VOL_COUNT, ubi->vtbl_slots); ubi_msg(ubi, "max/mean erase counter: %d/%d, WL threshold: %d, image sequence number: %u", ubi->max_ec, ubi->mean_ec, CONFIG_MTD_UBI_WL_THRESHOLD, ubi->image_seq); ubi_msg(ubi, "available PEBs: %d, total reserved PEBs: %d, PEBs reserved for bad PEB handling: %d", ubi->avail_pebs, ubi->rsvd_pebs, ubi->beb_rsvd_pebs); /* * The below lock makes sure we do not race with 'ubi_thread()' which * checks @ubi->thread_enabled. Otherwise we may fail to wake it up. */ spin_lock(&ubi->wl_lock); ubi->thread_enabled = 1; wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; out_debugfs: ubi_debugfs_exit_dev(ubi); out_uif: uif_close(ubi); out_detach: ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); out_free: vfree(ubi->peb_buf); vfree(ubi->fm_buf); put_device(&ubi->dev); return err; } /** * ubi_detach_mtd_dev - detach an MTD device. * @ubi_num: UBI device number to detach from * @anyway: detach MTD even if device reference count is not zero * * This function destroys an UBI device number @ubi_num and detaches the * underlying MTD device. Returns zero in case of success and %-EBUSY if the * UBI device is busy and cannot be destroyed, and %-EINVAL if it does not * exist. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_detach_mtd_dev(int ubi_num, int anyway) { struct ubi_device *ubi; if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) return -EINVAL; ubi = ubi_get_device(ubi_num); if (!ubi) return -EINVAL; spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { spin_unlock(&ubi_devices_lock); return -EBUSY; } /* This may only happen if there is a bug */ ubi_err(ubi, "%s reference count %d, destroy anyway", ubi->ubi_name, ubi->ref_count); } ubi->is_dead = true; spin_unlock(&ubi_devices_lock); ubi_notify_all(ubi, UBI_VOLUME_SHUTDOWN, NULL); spin_lock(&ubi_devices_lock); put_device(&ubi->dev); ubi_devices[ubi_num] = NULL; spin_unlock(&ubi_devices_lock); ubi_assert(ubi_num == ubi->ubi_num); ubi_notify_all(ubi, UBI_VOLUME_REMOVED, NULL); ubi_msg(ubi, "detaching mtd%d", ubi->mtd->index); #ifdef CONFIG_MTD_UBI_FASTMAP /* If we don't write a new fastmap at detach time we lose all * EC updates that have been made since the last written fastmap. * In case of fastmap debugging we omit the update to simulate an * unclean shutdown. */ if (!ubi_dbg_chk_fastmap(ubi)) ubi_update_fastmap(ubi); #endif /* * Before freeing anything, we have to stop the background thread to * prevent it from doing anything on this device while we are freeing. */ if (ubi->bgt_thread) kthread_stop(ubi->bgt_thread); #ifdef CONFIG_MTD_UBI_FASTMAP cancel_work_sync(&ubi->fm_work); #endif ubi_debugfs_exit_dev(ubi); uif_close(ubi); ubi_wl_close(ubi); ubi_free_internal_volumes(ubi); vfree(ubi->vtbl); vfree(ubi->peb_buf); vfree(ubi->fm_buf); ubi_msg(ubi, "mtd%d is detached", ubi->mtd->index); put_mtd_device(ubi->mtd); put_device(&ubi->dev); return 0; } /** * open_mtd_by_chdev - open an MTD device by its character device node path. * @mtd_dev: MTD character device node path * * This helper function opens an MTD device by its character node device path. * Returns MTD device description object in case of success and a negative * error code in case of failure. */ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev) { int err, minor; struct path path; struct kstat stat; /* Probably this is an MTD character device node path */ err = kern_path(mtd_dev, LOOKUP_FOLLOW, &path); if (err) return ERR_PTR(err); err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); path_put(&path); if (err) return ERR_PTR(err); /* MTD device number is defined by the major / minor numbers */ if (MAJOR(stat.rdev) != MTD_CHAR_MAJOR || !S_ISCHR(stat.mode)) return ERR_PTR(-EINVAL); minor = MINOR(stat.rdev); if (minor & 1) /* * Just do not think the "/dev/mtdrX" devices support is need, * so do not support them to avoid doing extra work. */ return ERR_PTR(-EINVAL); return get_mtd_device(NULL, minor / 2); } /** * open_mtd_device - open MTD device by name, character device path, or number. * @mtd_dev: name, character device node path, or MTD device device number * * This function tries to open and MTD device described by @mtd_dev string, * which is first treated as ASCII MTD device number, and if it is not true, it * is treated as MTD device name, and if that is also not true, it is treated * as MTD character device node path. Returns MTD device description object in * case of success and a negative error code in case of failure. */ static struct mtd_info * __init open_mtd_device(const char *mtd_dev) { struct mtd_info *mtd; int mtd_num; char *endp; mtd_num = simple_strtoul(mtd_dev, &endp, 0); if (*endp != '\0' || mtd_dev == endp) { /* * This does not look like an ASCII integer, probably this is * MTD device name. */ mtd = get_mtd_device_nm(mtd_dev); if (PTR_ERR(mtd) == -ENODEV) /* Probably this is an MTD character device node path */ mtd = open_mtd_by_chdev(mtd_dev); } else mtd = get_mtd_device(NULL, mtd_num); return mtd; } static void ubi_notify_add(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); int err; if (!of_device_is_compatible(np, "linux,ubi")) return; /* * we are already holding &mtd_table_mutex, but still need * to bump refcount */ err = __get_mtd_device(mtd); if (err) return; /* called while holding mtd_table_mutex */ mutex_lock_nested(&ubi_devices_mutex, SINGLE_DEPTH_NESTING); err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, 0, 0, false, false); mutex_unlock(&ubi_devices_mutex); if (err < 0) __put_mtd_device(mtd); } static void ubi_notify_remove(struct mtd_info *mtd) { /* do nothing for now */ } static struct mtd_notifier ubi_mtd_notifier = { .add = ubi_notify_add, .remove = ubi_notify_remove, }; static int __init ubi_init_attach(void) { int err, i, k; /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; struct mtd_info *mtd; cond_resched(); mtd = open_mtd_device(p->name); if (IS_ERR(mtd)) { err = PTR_ERR(mtd); pr_err("UBI error: cannot open mtd %s, error %d\n", p->name, err); /* See comment below re-ubi_is_module(). */ if (ubi_is_module()) goto out_detach; continue; } mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, p->vid_hdr_offs, p->max_beb_per1024, p->enable_fm == 0, p->need_resv_pool != 0); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", mtd->index); put_mtd_device(mtd); /* * Originally UBI stopped initializing on any error. * However, later on it was found out that this * behavior is not very good when UBI is compiled into * the kernel and the MTD devices to attach are passed * through the command line. Indeed, UBI failure * stopped whole boot sequence. * * To fix this, we changed the behavior for the * non-module case, but preserved the old behavior for * the module case, just for compatibility. This is a * little inconsistent, though. */ if (ubi_is_module()) goto out_detach; } } return 0; out_detach: for (k = 0; k < i; k++) if (ubi_devices[k]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } return err; } #ifndef CONFIG_MTD_UBI_MODULE late_initcall(ubi_init_attach); #endif static int __init ubi_init(void) { int err; /* Ensure that EC and VID headers have correct size */ BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); if (mtd_devs > UBI_MAX_DEVICES) { pr_err("UBI error: too many MTD devices, maximum is %d\n", UBI_MAX_DEVICES); return -EINVAL; } /* Create base sysfs directory and sysfs files */ err = class_register(&ubi_class); if (err < 0) return err; err = misc_register(&ubi_ctrl_cdev); if (err) { pr_err("UBI error: cannot register device\n"); goto out; } ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", sizeof(struct ubi_wl_entry), 0, 0, NULL); if (!ubi_wl_entry_slab) { err = -ENOMEM; goto out_dev_unreg; } err = ubi_debugfs_init(); if (err) goto out_slab; err = ubiblock_init(); if (err) { pr_err("UBI error: block: cannot initialize, error %d\n", err); /* See comment above re-ubi_is_module(). */ if (ubi_is_module()) goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); if (ubi_is_module()) { err = ubi_init_attach(); if (err) goto out_mtd_notifier; } return 0; out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); ubiblock_exit(); out_debugfs: ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: misc_deregister(&ubi_ctrl_cdev); out: class_unregister(&ubi_class); pr_err("UBI error: cannot initialize UBI, error %d\n", err); return err; } device_initcall(ubi_init); static void __exit ubi_exit(void) { int i; ubiblock_exit(); unregister_mtd_user(&ubi_mtd_notifier); for (i = 0; i < UBI_MAX_DEVICES; i++) if (ubi_devices[i]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[i]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } ubi_debugfs_exit(); kmem_cache_destroy(ubi_wl_entry_slab); misc_deregister(&ubi_ctrl_cdev); class_unregister(&ubi_class); } module_exit(ubi_exit); /** * bytes_str_to_int - convert a number of bytes string into an integer. * @str: the string to convert * * This function returns positive resulting integer in case of success and a * negative error code in case of failure. */ static int bytes_str_to_int(const char *str) { char *endp; unsigned long result; result = simple_strtoul(str, &endp, 0); if (str == endp || result >= INT_MAX) { pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } switch (*endp) { case 'G': result *= 1024; fallthrough; case 'M': result *= 1024; fallthrough; case 'K': result *= 1024; break; case '\0': break; default: pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } return result; } /** * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter. * @val: the parameter value to parse * @kp: not used * * This function returns zero in case of success and a negative error code in * case of error. */ static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) { int i, len; struct mtd_dev_param *p; char buf[MTD_PARAM_LEN_MAX]; char *pbuf = &buf[0]; char *tokens[MTD_PARAM_MAX_COUNT], *token; if (!val) return -EINVAL; if (mtd_devs == UBI_MAX_DEVICES) { pr_err("UBI error: too many parameters, max. is %d\n", UBI_MAX_DEVICES); return -EINVAL; } len = strnlen(val, MTD_PARAM_LEN_MAX); if (len == MTD_PARAM_LEN_MAX) { pr_err("UBI error: parameter \"%s\" is too long, max. is %d\n", val, MTD_PARAM_LEN_MAX); return -EINVAL; } if (len == 0) { pr_warn("UBI warning: empty 'mtd=' parameter - ignored\n"); return 0; } strcpy(buf, val); /* Get rid of the final newline */ if (buf[len - 1] == '\n') buf[len - 1] = '\0'; for (i = 0; i < MTD_PARAM_MAX_COUNT; i++) tokens[i] = strsep(&pbuf, ","); if (pbuf) { pr_err("UBI error: too many arguments at \"%s\"\n", val); return -EINVAL; } p = &mtd_dev_param[mtd_devs]; strcpy(&p->name[0], tokens[0]); token = tokens[1]; if (token) { p->vid_hdr_offs = bytes_str_to_int(token); if (p->vid_hdr_offs < 0) return p->vid_hdr_offs; } token = tokens[2]; if (token) { int err = kstrtoint(token, 10, &p->max_beb_per1024); if (err) { pr_err("UBI error: bad value for max_beb_per1024 parameter: %s\n", token); return -EINVAL; } } token = tokens[3]; if (token) { int err = kstrtoint(token, 10, &p->ubi_num); if (err || p->ubi_num < UBI_DEV_NUM_AUTO) { pr_err("UBI error: bad value for ubi_num parameter: %s\n", token); return -EINVAL; } } else p->ubi_num = UBI_DEV_NUM_AUTO; token = tokens[4]; if (token) { int err = kstrtoint(token, 10, &p->enable_fm); if (err) { pr_err("UBI error: bad value for enable_fm parameter: %s\n", token); return -EINVAL; } } else p->enable_fm = 0; token = tokens[5]; if (token) { int err = kstrtoint(token, 10, &p->need_resv_pool); if (err) { pr_err("UBI error: bad value for need_resv_pool parameter: %s\n", token); return -EINVAL; } } else p->need_resv_pool = 0; mtd_devs += 1; return 0; } module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 0400); MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]].\n" "Multiple \"mtd\" parameters may be specified.\n" "MTD devices may be specified by their number, name, or path to the MTD character device node.\n" "Optional \"vid_hdr_offs\" parameter specifies UBI VID header position to be used by UBI. (default value if 0)\n" "Optional \"max_beb_per1024\" parameter specifies the maximum expected bad eraseblock per 1024 eraseblocks. (default value (" __stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") if 0)\n" "Optional \"ubi_num\" parameter specifies UBI device number which have to be assigned to the newly created UBI device (assigned automatically by default)\n" "Optional \"enable_fm\" parameter determines whether to enable fastmap during attach. If the value is non-zero, fastmap is enabled. Default value is 0.\n" "Optional \"need_resv_pool\" parameter determines whether to reserve pool->max_size pebs during attach. If the value is non-zero, peb reservation is enabled. Default value is 0.\n" "\n" "Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n" "Example 2: mtd=content,1984 mtd=4 - attach MTD device with name \"content\" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n" "Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling.\n" "Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5 and using default values for the other fields.\n" "example 5: mtd=1,0,0,5 mtd=2,0,0,6,1 - attach MTD device /dev/mtd1 to UBI 5 and disable fastmap; attach MTD device /dev/mtd2 to UBI 6 and enable fastmap.(only works when fastmap is enabled and fm_autoconvert=Y).\n" "\t(e.g. if the NAND *chipset* has 4096 PEB, 100 will be reserved for this UBI device)."); #ifdef CONFIG_MTD_UBI_FASTMAP module_param(fm_autoconvert, bool, 0644); MODULE_PARM_DESC(fm_autoconvert, "Set this parameter to enable fastmap automatically on images without a fastmap."); module_param(fm_debug, bool, 0); MODULE_PARM_DESC(fm_debug, "Set this parameter to enable fastmap debugging by default. Warning, this will make fastmap slow!"); #endif MODULE_VERSION(__stringify(UBI_VERSION)); MODULE_DESCRIPTION("UBI - Unsorted Block Images"); MODULE_AUTHOR("Artem Bityutskiy"); MODULE_LICENSE("GPL");
99 158 223 35 55 148 230 1 275 153 2 1 89 189 97 98 4 185 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for INET connection oriented protocols. * * Definitions for inet_connection_sock * * Authors: Many people, see the TCP sources * * From code originally in TCP */ #ifndef _INET_CONNECTION_SOCK_H #define _INET_CONNECTION_SOCK_H #include <linux/compiler.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/poll.h> #include <linux/kernel.h> #include <linux/sockptr.h> #include <net/inet_sock.h> #include <net/request_sock.h> /* Cancel timers, when they are not required. */ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; struct inet_bind2_bucket; struct tcp_congestion_ops; /* * Pointers to address related TCP functions * (i.e. things that depend on the address family) */ struct inet_connection_sock_af_ops { int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl); void (*send_check)(struct sock *sk, struct sk_buff *skb); int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); u16 net_header_len; int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); void (*mtu_reduced)(struct sock *sk); }; /** inet_connection_sock - INET connection oriented sock * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries * @icsk_probes_out: unanswered 0 window probes * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data * @icsk_mtup; MTU probing control data * @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack) * @icsk_user_timeout: TCP_USER_TIMEOUT value */ struct inet_connection_sock { /* inet_sock has to be the first member! */ struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; __u32 icsk_rto_min; u32 icsk_rto_max; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; __u8 icsk_pending; __u8 icsk_backoff; __u8 icsk_syn_retries; __u8 icsk_probes_out; __u16 icsk_ext_hdr_len; struct { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ __u8 pingpong; /* The session is interactive */ __u8 retry; /* Number of attempts */ #define ATO_BITS 8 __u32 ato:ATO_BITS, /* Predicted tick of soft clock */ lrcv_flowlabel:20, /* last received ipv6 flowlabel */ dst_quick_ack:1, /* cache dst RTAX_QUICKACK */ unused:3; __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; struct { /* Range of MTUs to search */ int search_high; int search_low; /* Information on the current probe. */ u32 probe_size:31, /* Is the MTUP feature enabled for this connection? */ enabled:1; u32 probe_timestamp; } icsk_mtup; u32 icsk_probes_tstamp; u32 icsk_user_timeout; u64 icsk_ca_priv[104 / sizeof(u64)]; #define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ #define ICSK_TIME_DACK 2 /* Delayed ack timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ #define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */ #define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk) static inline void *inet_csk_ca(const struct sock *sk) { return (void *)inet_csk(sk)->icsk_ca_priv; } struct sock *inet_csk_clone_lock(const struct sock *sk, const struct request_sock *req, const gfp_t priority); enum inet_csk_ack_state_t { ICSK_ACK_SCHED = 1, ICSK_ACK_TIMER = 2, ICSK_ACK_PUSHED = 4, ICSK_ACK_PUSHED2 = 8, ICSK_ACK_NOW = 16, /* Send the next ACK immediately (once) */ ICSK_ACK_NOMEM = 32, }; void inet_csk_init_xmit_timers(struct sock *sk, void (*retransmit_handler)(struct timer_list *), void (*delack_handler)(struct timer_list *), void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); void inet_csk_clear_xmit_timers_sync(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_SCHED; } static inline int inet_csk_ack_scheduled(const struct sock *sk) { return inet_csk(sk)->icsk_ack.pending & ICSK_ACK_SCHED; } static inline void inet_csk_delack_init(struct sock *sk) { memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } static inline unsigned long icsk_timeout(const struct inet_connection_sock *icsk) { return READ_ONCE(icsk->icsk_retransmit_timer.expires); } static inline unsigned long icsk_delack_timeout(const struct inet_connection_sock *icsk) { return READ_ONCE(icsk->icsk_delack_timer.expires); } static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif } else { pr_debug("inet_csk BUG: unknown timer value\n"); } } /* * Reset the retransmission timer */ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { struct inet_connection_sock *icsk = inet_csk(sk); if (when > max_when) { pr_debug("reset_xmit_timer: sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, (void *)_THIS_IP_); when = max_when; } when += jiffies; if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { smp_store_release(&icsk->icsk_pending, what); sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_TIMER); sk_reset_timer(sk, &icsk->icsk_delack_timer, when); } else { pr_debug("inet_csk BUG: unknown timer value\n"); } } static inline unsigned long inet_csk_rto_backoff(const struct inet_connection_sock *icsk, unsigned long max_when) { u64 when = (u64)icsk->icsk_rto << icsk->icsk_backoff; return (unsigned long)min_t(u64, when, max_when); } struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg); int inet_csk_get_port(struct sock *sk, unsigned short snum); struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, const struct request_sock *req); struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, unsigned long timeout); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req); static inline void inet_csk_reqsk_queue_added(struct sock *sk) { reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue); } static inline int inet_csk_reqsk_queue_len(const struct sock *sk) { return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); } static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); static inline unsigned long reqsk_timeout(struct request_sock *req, unsigned long max_timeout) { u64 timeout = (u64)req->timeout << req->num_timeout; return (unsigned long)min_t(u64, timeout, max_timeout); } void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* * LISTEN is a special case for poll.. */ static inline __poll_t inet_csk_listen_poll(const struct sock *sk) { return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (EPOLLIN | EPOLLRDNORM) : 0; } int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); /* update the fast reuse flag when adding a socket */ void inet_csk_update_fastreuse(const struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); static inline void inet_csk_enter_pingpong_mode(struct sock *sk) { inet_csk(sk)->icsk_ack.pingpong = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_exit_pingpong_mode(struct sock *sk) { inet_csk(sk)->icsk_ack.pingpong = 0; } static inline bool inet_csk_in_pingpong_mode(struct sock *sk) { return inet_csk(sk)->icsk_ack.pingpong >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh); } static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.pingpong < U8_MAX) icsk->icsk_ack.pingpong++; } static inline bool inet_csk_has_ulp(const struct sock *sk) { return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; } static inline void inet_init_csk_locks(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); } #endif /* _INET_CONNECTION_SOCK_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56