// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <asm/asm-extable.h>
#include <asm/ptrace.h>

static inline unsigned long
get_ex_fixup(const struct exception_table_entry *ex)
{
	return ((unsigned long)&ex->fixup + ex->fixup);
}

static bool ex_handler_fixup(const struct exception_table_entry *ex,
			     struct pt_regs *regs)
{
	regs->epc = get_ex_fixup(ex);
	return true;
}

static inline unsigned long regs_get_gpr(struct pt_regs *regs,
					 unsigned int offset)
{
	if (unlikely(!offset || offset > MAX_REG_OFFSET))
		return 0;

	return *(unsigned long *)((unsigned long)regs + offset);
}

static inline void regs_set_gpr(struct pt_regs *regs, unsigned int offset,
				unsigned long val)
{
	if (unlikely(offset > MAX_REG_OFFSET))
		return;

	if (offset)
		*(unsigned long *)((unsigned long)regs + offset) = val;
}

static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
					struct pt_regs *regs)
{
	int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
	int reg_zero = FIELD_GET(EX_DATA_REG_ZERO, ex->data);

	regs_set_gpr(regs, reg_err * sizeof(unsigned long), -EFAULT);
	regs_set_gpr(regs, reg_zero * sizeof(unsigned long), 0);

	regs->epc = get_ex_fixup(ex);
	return true;
}

static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
				  struct pt_regs *regs)
{
	int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data);
	int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
	unsigned long data, addr, offset;

	addr = regs_get_gpr(regs, reg_addr * sizeof(unsigned long));

	offset = addr & 0x7UL;
	addr &= ~0x7UL;

	data = *(unsigned long *)addr >> (offset * 8);

	regs_set_gpr(regs, reg_data * sizeof(unsigned long), data);

	regs->epc = get_ex_fixup(ex);
	return true;
}

bool fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *ex;

	ex = search_exception_tables(regs->epc);
	if (!ex)
		return false;

	switch (ex->type) {
	case EX_TYPE_FIXUP:
		return ex_handler_fixup(ex, regs);
	case EX_TYPE_BPF:
		return ex_handler_bpf(ex, regs);
	case EX_TYPE_UACCESS_ERR_ZERO:
		return ex_handler_uaccess_err_zero(ex, regs);
	case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
		return ex_handler_load_unaligned_zeropad(ex, regs);
	}

	BUG();
}
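/*
 * Editorial example (not part of the file above): a minimal userspace sketch
 * of the position-relative encoding that get_ex_fixup() decodes. Exception
 * table entries store the fixup address as a signed offset from the field's
 * own address, so the table stays valid wherever the image is loaded. The
 * struct and names below are simplified stand-ins, not the kernel's.
 */
#include <stdio.h>
#include <stdint.h>

struct fake_extable_entry {
	int32_t insn;	/* offset from &entry->insn to the faulting insn */
	int32_t fixup;	/* offset from &entry->fixup to the fixup code */
};

static uintptr_t decode_fixup(const struct fake_extable_entry *ex)
{
	/* Same arithmetic as get_ex_fixup(): field address plus stored delta. */
	return (uintptr_t)&ex->fixup + ex->fixup;
}

int main(void)
{
	static char fixup_stub[16];		/* stand-in for a fixup landing pad */
	static struct fake_extable_entry ex;	/* static so the delta fits in 32 bits */

	/* Encode: store the distance from the field to the target. */
	ex.fixup = (int32_t)((uintptr_t)fixup_stub - (uintptr_t)&ex.fixup);

	printf("round trip ok: %d\n",
	       decode_fixup(&ex) == (uintptr_t)fixup_stub);
	return 0;
}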
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);

static bool new_idmap_permitted(const struct file *file,
				struct user_namespace *ns, int cap_setid,
				struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
	return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
}

static void
dec_user_namespaces(struct ucounts *ucounts)
{
	return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
	/* Start with the same capabilities as init but useless for doing
	 * anything as the capabilities are bound to the new user namespace.
	 */
	cred->securebits = SECUREBITS_DEFAULT;
	cred->cap_inheritable = CAP_EMPTY_SET;
	cred->cap_permitted = CAP_FULL_SET;
	cred->cap_effective = CAP_FULL_SET;
	cred->cap_ambient = CAP_EMPTY_SET;
	cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
	key_put(cred->request_key_auth);
	cred->request_key_auth = NULL;
#endif
	/* tgcred will be cleared in our caller because CLONE_THREAD won't be set */
	cred->user_ns = user_ns;
}

static unsigned long enforced_nproc_rlimit(void)
{
	unsigned long limit = RLIM_INFINITY;

	/* Is RLIMIT_NPROC currently enforced? */
	if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
	    (current_user_ns() != &init_user_ns))
		limit = rlimit(RLIMIT_NPROC);

	return limit;
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.
 */
int create_user_ns(struct cred *new)
{
	struct user_namespace *ns, *parent_ns = new->user_ns;
	kuid_t owner = new->euid;
	kgid_t group = new->egid;
	struct ucounts *ucounts;
	int ret, i;

	ret = -ENOSPC;
	if (parent_ns->level > 32)
		goto fail;

	ucounts = inc_user_namespaces(parent_ns, owner);
	if (!ucounts)
		goto fail;

	/*
	 * Verify that we can not violate the policy of which files
	 * may be accessed that is specified by the root directory,
	 * by verifying that the root directory is at the root of the
	 * mount namespace which allows all files to be accessed.
	 */
	ret = -EPERM;
	if (current_chrooted())
		goto fail_dec;

	/* The creator needs a mapping in the parent user namespace
	 * or else we won't be able to reasonably tell userspace who
	 * created a user_namespace.
	 */
	ret = -EPERM;
	if (!kuid_has_mapping(parent_ns, owner) ||
	    !kgid_has_mapping(parent_ns, group))
		goto fail_dec;

	ret = security_create_user_ns(new);
	if (ret < 0)
		goto fail_dec;

	ret = -ENOMEM;
	ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
	if (!ns)
		goto fail_dec;

	ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
	ret = ns_alloc_inum(&ns->ns);
	if (ret)
		goto fail_free;
	ns->ns.ops = &userns_operations;

	refcount_set(&ns->ns.count, 1);
	/* Leave the new->user_ns reference with the new user namespace.
	 */
	ns->parent = parent_ns;
	ns->level = parent_ns->level + 1;
	ns->owner = owner;
	ns->group = group;
	INIT_WORK(&ns->work, free_user_ns);
	for (i = 0; i < UCOUNT_COUNTS; i++) {
		ns->ucount_max[i] = INT_MAX;
	}
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
	ns->ucounts = ucounts;

	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
	mutex_lock(&userns_state_mutex);
	ns->flags = parent_ns->flags;
	mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
	INIT_LIST_HEAD(&ns->keyring_name_list);
	init_rwsem(&ns->keyring_sem);
#endif
	ret = -ENOMEM;
	if (!setup_userns_sysctls(ns))
		goto fail_keyring;

	set_cred_user_ns(new, ns);
	return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
	key_put(ns->persistent_keyring_register);
#endif
	ns_free_inum(&ns->ns);
fail_free:
	kmem_cache_free(user_ns_cachep, ns);
fail_dec:
	dec_user_namespaces(ucounts);
fail:
	return ret;
}

int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
	struct cred *cred;
	int err = -ENOMEM;

	if (!(unshare_flags & CLONE_NEWUSER))
		return 0;

	cred = prepare_creds();
	if (cred) {
		err = create_user_ns(cred);
		if (err)
			put_cred(cred);
		else
			*new_cred = cred;
	}

	return err;
}

static void free_user_ns(struct work_struct *work)
{
	struct user_namespace *parent, *ns =
		container_of(work, struct user_namespace, work);

	do {
		struct ucounts *ucounts = ns->ucounts;
		parent = ns->parent;
		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->gid_map.forward);
			kfree(ns->gid_map.reverse);
		}
		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->uid_map.forward);
			kfree(ns->uid_map.reverse);
		}
		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->projid_map.forward);
			kfree(ns->projid_map.reverse);
		}
#if IS_ENABLED(CONFIG_BINFMT_MISC)
		kfree(ns->binfmt_misc);
#endif
		retire_userns_sysctls(ns);
		key_free_user_ns(ns);
		ns_free_inum(&ns->ns);
		kmem_cache_free(user_ns_cachep, ns);
		dec_user_namespaces(ucounts);
		ns = parent;
	} while (refcount_dec_and_test(&parent->ns.count));
}

void __put_user_ns(struct user_namespace *ns)
{
	schedule_work(&ns->work);
}
EXPORT_SYMBOL(__put_user_ns);

/*
 * struct idmap_key - holds the information necessary to find an idmapping in a
 * sorted idmap array. It is passed to cmp_map_id() as first argument.
 */
struct idmap_key {
	bool map_up;	/* true -> id from kid; false -> kid from id */
	u32 id;		/* id to find */
	u32 count;	/* == 0 unless used with map_id_range_down() */
};

/*
 * cmp_map_id - Function to be passed to bsearch() to find the requested
 * idmapping. Expects struct idmap_key to be passed via @k.
 */
static int cmp_map_id(const void *k, const void *e)
{
	u32 first, last, id2;
	const struct idmap_key *key = k;
	const struct uid_gid_extent *el = e;

	id2 = key->id + key->count - 1;

	/* handle map_id_{down,up}() */
	if (key->map_up)
		first = el->lower_first;
	else
		first = el->first;

	last = first + el->count - 1;

	if (key->id >= first && key->id <= last &&
	    (id2 >= first && id2 <= last))
		return 0;

	if (key->id < first || id2 < first)
		return -1;

	return 1;
}
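/*
 * Editorial example (not part of the file): a self-contained userspace sketch
 * of the extent lookup that map_id_range_down_base() below performs. An
 * extent says "ids [first, first+count) map to [lower_first, ...)"; a lookup
 * succeeds only if the whole requested range fits inside one extent.
 */
#include <stdio.h>
#include <stdint.h>

struct extent { uint32_t first, lower_first, count; };

/* Map [id, id+count) down through a small extent table, or return UINT32_MAX. */
static uint32_t map_range_down(const struct extent *ext, unsigned n,
			       uint32_t id, uint32_t count)
{
	for (unsigned i = 0; i < n; i++) {
		uint32_t first = ext[i].first;
		uint32_t last = first + ext[i].count - 1;
		uint32_t id2 = id + count - 1;

		if (id >= first && id <= last && id2 >= first && id2 <= last)
			return (id - first) + ext[i].lower_first;
	}
	return UINT32_MAX;
}

int main(void)
{
	/* "0 100000 65536" in /proc/<pid>/uid_map syntax. */
	struct extent map[] = { { 0, 100000, 65536 } };

	printf("%u\n", map_range_down(map, 1, 0, 1));		/* 100000 */
	printf("%u\n", map_range_down(map, 1, 1000, 1));	/* 101000 */
	printf("%u\n", map_range_down(map, 1, 70000, 1));	/* 4294967295 */
	return 0;
}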
/*
 * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static struct uid_gid_extent *
map_id_range_down_max(unsigned extents, struct uid_gid_map *map,
		      u32 id, u32 count)
{
	struct idmap_key key;

	key.map_up = false;
	key.count = count;
	key.id = id;

	return bsearch(&key, map->forward, extents,
		       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * map_id_range_down_base - Find idmap via binary search in static extent array.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static struct uid_gid_extent *
map_id_range_down_base(unsigned extents, struct uid_gid_map *map,
		       u32 id, u32 count)
{
	unsigned idx;
	u32 first, last, id2;

	id2 = id + count - 1;

	/* Find the matching extent */
	for (idx = 0; idx < extents; idx++) {
		first = map->extent[idx].first;
		last = first + map->extent[idx].count - 1;
		if (id >= first && id <= last &&
		    (id2 >= first && id2 <= last))
			return &map->extent[idx];
	}
	return NULL;
}

static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
	struct uid_gid_extent *extent;
	unsigned extents = map->nr_extents;

	smp_rmb();

	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		extent = map_id_range_down_base(extents, map, id, count);
	else
		extent = map_id_range_down_max(extents, map, id, count);

	/* Map the id or note failure */
	if (extent)
		id = (id - extent->first) + extent->lower_first;
	else
		id = (u32) -1;

	return id;
}

u32 map_id_down(struct uid_gid_map *map, u32 id)
{
	return map_id_range_down(map, id, 1);
}

/*
 * map_id_up_base - Find idmap via binary search in static extent array.
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static struct uid_gid_extent *
map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
{
	unsigned idx;
	u32 first, last;

	/* Find the matching extent */
	for (idx = 0; idx < extents; idx++) {
		first = map->extent[idx].lower_first;
		last = first + map->extent[idx].count - 1;
		if (id >= first && id <= last)
			return &map->extent[idx];
	}
	return NULL;
}

/*
 * map_id_up_max - Find idmap via binary search in ordered idmap array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static struct uid_gid_extent *
map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
{
	struct idmap_key key;

	key.map_up = true;
	key.count = 1;
	key.id = id;

	return bsearch(&key, map->reverse, extents,
		       sizeof(struct uid_gid_extent), cmp_map_id);
}

u32 map_id_up(struct uid_gid_map *map, u32 id)
{
	struct uid_gid_extent *extent;
	unsigned extents = map->nr_extents;

	smp_rmb();

	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		extent = map_id_up_base(extents, map, id);
	else
		extent = map_id_up_max(extents, map, id);

	/* Map the id or note failure */
	if (extent)
		id = (id - extent->lower_first) + extent->first;
	else
		id = (u32) -1;

	return id;
}

/**
 * make_kuid - Map a user-namespace uid pair into a kuid.
 * @ns:  User namespace that the uid is in
 * @uid: User identifier
 *
 * Maps a user-namespace uid pair into a kernel internal kuid,
 * and returns that kuid.
 *
 * When there is no mapping defined for the user-namespace uid
 * pair INVALID_UID is returned.  Callers are expected to test
 * for and handle INVALID_UID being returned.  INVALID_UID
 * may be tested for using uid_valid().
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
	/* Map the uid to a global kernel uid */
	return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
}
EXPORT_SYMBOL(make_kuid);

/**
 * from_kuid - Create a uid from a kuid user-namespace pair.
 * @targ: The user namespace we want a uid in.
 * @kuid: The kernel internal uid to start with.
 *
 * Map @kuid into the user-namespace specified by @targ and
 * return the resulting uid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
	/* Map the uid from a global kernel uid */
	return map_id_up(&targ->uid_map, __kuid_val(kuid));
}
EXPORT_SYMBOL(from_kuid);

/**
 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
 * @targ: The user namespace we want a uid in.
 * @kuid: The kernel internal uid to start with.
 *
 * Map @kuid into the user-namespace specified by @targ and
 * return the resulting uid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * Unlike from_kuid from_kuid_munged never fails and always
 * returns a valid uid.  This makes from_kuid_munged appropriate
 * for use in syscalls like stat and getuid where failing the
 * system call and failing to provide a valid uid are not
 * options.
 *
 * If @kuid has no mapping in @targ overflowuid is returned.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
	uid_t uid;

	uid = from_kuid(targ, kuid);
	if (uid == (uid_t) -1)
		uid = overflowuid;

	return uid;
}
EXPORT_SYMBOL(from_kuid_munged);

/**
 * make_kgid - Map a user-namespace gid pair into a kgid.
 * @ns:  User namespace that the gid is in
 * @gid: group identifier
 *
 * Maps a user-namespace gid pair into a kernel internal kgid,
 * and returns that kgid.
 *
 * When there is no mapping defined for the user-namespace gid
 * pair INVALID_GID is returned.  Callers are expected to test
 * for and handle INVALID_GID being returned.  INVALID_GID may be
 * tested for using gid_valid().
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
	/* Map the gid to a global kernel gid */
	return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
}
EXPORT_SYMBOL(make_kgid);

/**
 * from_kgid - Create a gid from a kgid user-namespace pair.
 * @targ: The user namespace we want a gid in.
 * @kgid: The kernel internal gid to start with.
 *
 * Map @kgid into the user-namespace specified by @targ and
 * return the resulting gid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
	/* Map the gid from a global kernel gid */
	return map_id_up(&targ->gid_map, __kgid_val(kgid));
}
EXPORT_SYMBOL(from_kgid);

/**
 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
 * @targ: The user namespace we want a gid in.
 * @kgid: The kernel internal gid to start with.
 *
 * Map @kgid into the user-namespace specified by @targ and
 * return the resulting gid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * Unlike from_kgid from_kgid_munged never fails and always
 * returns a valid gid.  This makes from_kgid_munged appropriate
 * for use in syscalls like stat and getgid where failing the
 * system call and failing to provide a valid gid are not options.
 *
 * If @kgid has no mapping in @targ overflowgid is returned.
 */
gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
{
	gid_t gid;

	gid = from_kgid(targ, kgid);
	if (gid == (gid_t) -1)
		gid = overflowgid;

	return gid;
}
EXPORT_SYMBOL(from_kgid_munged);

/**
 * make_kprojid - Map a user-namespace projid pair into a kprojid.
 * @ns:     User namespace that the projid is in
 * @projid: Project identifier
 *
 * Maps a user-namespace projid pair into a kernel internal kprojid,
 * and returns that kprojid.
 *
 * When there is no mapping defined for the user-namespace projid
 * pair INVALID_PROJID is returned.  Callers are expected to test
 * for and handle INVALID_PROJID being returned.  INVALID_PROJID
 * may be tested for using projid_valid().
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
	/* Map the projid to a global kernel projid */
	return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
}
EXPORT_SYMBOL(make_kprojid);

/**
 * from_kprojid - Create a projid from a kprojid user-namespace pair.
 * @targ: The user namespace we want a projid in.
 * @kprojid: The kernel internal project identifier to start with.
 *
 * Map @kprojid into the user-namespace specified by @targ and
 * return the resulting projid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
	/* Map the projid from a global kernel projid */
	return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
}
EXPORT_SYMBOL(from_kprojid);

/**
 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
 * @targ: The user namespace we want a projid in.
 * @kprojid: The kernel internal projid to start with.
 *
 * Map @kprojid into the user-namespace specified by @targ and
 * return the resulting projid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * Unlike from_kprojid from_kprojid_munged never fails and always
 * returns a valid projid.  This makes from_kprojid_munged
 * appropriate for use in syscalls like stat where failing the
 * system call and failing to provide a valid projid are not
 * options.
 *
 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
 */
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
	projid_t projid;

	projid = from_kprojid(targ, kprojid);
	if (projid == (projid_t) -1)
		projid = OVERFLOW_PROJID;

	return projid;
}
EXPORT_SYMBOL(from_kprojid_munged);

static int uid_m_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	struct uid_gid_extent *extent = v;
	struct user_namespace *lower_ns;
	uid_t lower;

	lower_ns = seq_user_ns(seq);
	if ((lower_ns == ns) && lower_ns->parent)
		lower_ns = lower_ns->parent;

	lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

	seq_printf(seq, "%10u %10u %10u\n",
		   extent->first,
		   lower,
		   extent->count);

	return 0;
}

static int gid_m_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	struct uid_gid_extent *extent = v;
	struct user_namespace *lower_ns;
	gid_t lower;

	lower_ns = seq_user_ns(seq);
	if ((lower_ns == ns) && lower_ns->parent)
		lower_ns = lower_ns->parent;

	lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));

	seq_printf(seq, "%10u %10u %10u\n",
		   extent->first,
		   lower,
		   extent->count);

	return 0;
}

static int projid_m_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	struct uid_gid_extent *extent = v;
	struct user_namespace *lower_ns;
	projid_t lower;

	lower_ns = seq_user_ns(seq);
	if ((lower_ns == ns) && lower_ns->parent)
		lower_ns = lower_ns->parent;

	lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));

	seq_printf(seq, "%10u %10u %10u\n",
		   extent->first,
		   lower,
		   extent->count);

	return 0;
}

static void *m_start(struct seq_file *seq, loff_t *ppos,
		     struct uid_gid_map *map)
{
	loff_t pos = *ppos;
	unsigned extents = map->nr_extents;

	smp_rmb();

	if (pos >= extents)
		return NULL;

	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		return &map->extent[pos];

	return &map->forward[pos];
}

static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->uid_map);
}

static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->gid_map);
}

static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->projid_map);
}

static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return seq->op->start(seq, pos);
}

static void m_stop(struct seq_file *seq, void *v)
{
	return;
}

const struct seq_operations proc_uid_seq_operations = {
	.start = uid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = uid_m_show,
};

const struct seq_operations proc_gid_seq_operations = {
	.start = gid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = gid_m_show,
};

const struct seq_operations proc_projid_seq_operations = {
	.start = projid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = projid_m_show,
};

static bool mappings_overlap(struct uid_gid_map *new_map,
			     struct uid_gid_extent *extent)
{
	u32 upper_first, lower_first, upper_last, lower_last;
	unsigned idx;

	upper_first = extent->first;
	lower_first = extent->lower_first;
	upper_last = upper_first + extent->count - 1;
	lower_last = lower_first + extent->count - 1;

	for (idx = 0; idx < new_map->nr_extents; idx++) {
		u32 prev_upper_first, prev_lower_first;
		u32 prev_upper_last, prev_lower_last;
		struct uid_gid_extent *prev;

		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			prev = &new_map->extent[idx];
		else
			prev = &new_map->forward[idx];

		prev_upper_first = prev->first;
		prev_lower_first = prev->lower_first;
		prev_upper_last = prev_upper_first + prev->count - 1;
		prev_lower_last = prev_lower_first + prev->count - 1;

		/* Does the upper range intersect a previous extent? */
		if ((prev_upper_first <= upper_last) &&
		    (prev_upper_last >= upper_first))
			return true;

		/* Does the lower range intersect a previous extent? */
		if ((prev_lower_first <= lower_last) &&
		    (prev_lower_last >= lower_first))
			return true;
	}
	return false;
}
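/*
 * Editorial example (not part of the file): the overlap test above reduces to
 * the classic closed-interval intersection check, [a1,a2] and [b1,b2] overlap
 * iff b1 <= a2 && b2 >= a1. A tiny standalone demonstration:
 */
#include <assert.h>
#include <stdint.h>

static int ranges_intersect(uint32_t a1, uint32_t a2, uint32_t b1, uint32_t b2)
{
	return b1 <= a2 && b2 >= a1;
}

int main(void)
{
	assert(ranges_intersect(0, 999, 500, 1500));	/* partial overlap */
	assert(!ranges_intersect(0, 999, 1000, 1999));	/* adjacent, disjoint */
	return 0;
}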
/*
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * Takes care to allocate a 4K block of memory if the number of mappings exceeds
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
	struct uid_gid_extent *dest;

	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
		struct uid_gid_extent *forward;

		/* Allocate memory for 340 mappings. */
		forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
					sizeof(struct uid_gid_extent),
					GFP_KERNEL);
		if (!forward)
			return -ENOMEM;

		/* Copy over memory. Only set up memory for the forward pointer.
		 * Defer the memory setup for the reverse pointer.
		 */
		memcpy(forward, map->extent,
		       map->nr_extents * sizeof(map->extent[0]));

		map->forward = forward;
		map->reverse = NULL;
	}

	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
		dest = &map->extent[map->nr_extents];
	else
		dest = &map->forward[map->nr_extents];

	*dest = *extent;
	map->nr_extents++;
	return 0;
}

/* cmp function to sort() forward mappings */
static int cmp_extents_forward(const void *a, const void *b)
{
	const struct uid_gid_extent *e1 = a;
	const struct uid_gid_extent *e2 = b;

	if (e1->first < e2->first)
		return -1;

	if (e1->first > e2->first)
		return 1;

	return 0;
}

/* cmp function to sort() reverse mappings */
static int cmp_extents_reverse(const void *a, const void *b)
{
	const struct uid_gid_extent *e1 = a;
	const struct uid_gid_extent *e2 = b;

	if (e1->lower_first < e2->lower_first)
		return -1;

	if (e1->lower_first > e2->lower_first)
		return 1;

	return 0;
}

/*
 * sort_idmaps - Sorts an array of idmap entries.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int sort_idmaps(struct uid_gid_map *map)
{
	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		return 0;

	/* Sort forward array. */
	sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
	     cmp_extents_forward, NULL);

	/* Only copy the memory from forward we actually need. */
	map->reverse = kmemdup(map->forward,
			       map->nr_extents * sizeof(struct uid_gid_extent),
			       GFP_KERNEL);
	if (!map->reverse)
		return -ENOMEM;

	/* Sort reverse array. */
	sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
	     cmp_extents_reverse, NULL);

	return 0;
}

/**
 * verify_root_map() - check the uid 0 mapping
 * @file: idmapping file
 * @map_ns: user namespace of the target process
 * @new_map: requested idmap
 *
 * If a process requests mapping parent uid 0 into the new ns, verify that the
 * process writing the map had the CAP_SETFCAP capability as the target process
 * will be able to write fscaps that are valid in ancestor user namespaces.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
			    struct user_namespace *map_ns,
			    struct uid_gid_map *new_map)
{
	int idx;
	const struct user_namespace *file_ns = file->f_cred->user_ns;
	struct uid_gid_extent *extent0 = NULL;

	for (idx = 0; idx < new_map->nr_extents; idx++) {
		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			extent0 = &new_map->extent[idx];
		else
			extent0 = &new_map->forward[idx];
		if (extent0->lower_first == 0)
			break;

		extent0 = NULL;
	}

	if (!extent0)
		return true;

	if (map_ns == file_ns) {
		/* The process unshared its ns and is writing to its own
		 * /proc/self/uid_map. User already has full capabilities in
		 * the new namespace. Verify that the parent had CAP_SETFCAP
		 * when it unshared.
		 */
		if (!file_ns->parent_could_setfcap)
			return false;
	} else {
		/* Process p1 is writing to uid_map of p2, who is in a child
		 * user namespace to p1's.
		 * Verify that the opener of the map file has CAP_SETFCAP
		 * against the parent of the new map namespace.
		 */
		if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
			return false;
	}

	return true;
}
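/*
 * Editorial example (not part of the file): map_write() below parses lines of
 * the form "<first> <lower_first> <count>". A minimal userspace sketch of the
 * same parse-and-validate loop, using strtoul in place of simple_strtoul:
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Parse one "first lower_first count" line; return 0 on success. */
static int parse_idmap_line(const char *s, uint32_t out[3])
{
	char *end;

	for (int i = 0; i < 3; i++) {
		unsigned long v = strtoul(s, &end, 10);

		if (end == s || v > UINT32_MAX)
			return -1;
		out[i] = (uint32_t)v;
		s = end;
	}
	while (isspace((unsigned char)*s))
		s++;
	if (*s != '\0')	/* trailing junk, which map_write() also rejects */
		return -1;
	/* reject zero-length and wrapping extents, as map_write() does */
	if (out[2] == 0 || out[0] + out[2] < out[0] || out[1] + out[2] < out[1])
		return -1;
	return 0;
}

int main(void)
{
	uint32_t e[3];

	printf("%d\n", parse_idmap_line("0 100000 65536", e));	/* 0 */
	printf("%d\n", parse_idmap_line("0 100000 junk", e));	/* -1 */
	return 0;
}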
static ssize_t map_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos,
			 int cap_setid,
			 struct uid_gid_map *map,
			 struct uid_gid_map *parent_map)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *map_ns = seq->private;
	struct uid_gid_map new_map;
	unsigned idx;
	struct uid_gid_extent extent;
	char *kbuf = NULL, *pos, *next_line;
	ssize_t ret;

	/* Only allow < page size writes at the beginning of the file */
	if ((*ppos != 0) || (count >= PAGE_SIZE))
		return -EINVAL;

	/* Slurp in the user data */
	kbuf = memdup_user_nul(buf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/*
	 * The userns_state_mutex serializes all writes to any given map.
	 *
	 * Any map is only ever written once.
	 *
	 * An id map fits within 1 cache line on most architectures.
	 *
	 * On read nothing needs to be done unless you are on an
	 * architecture with a crazy cache coherency model like alpha.
	 *
	 * There is a one time data dependency between reading the
	 * count of the extents and the values of the extents.  The
	 * desired behavior is to see the values of the extents that
	 * were written before the count of the extents.
	 *
	 * To achieve this smp_wmb() is used to guarantee the write
	 * order and smp_rmb() ensures that we don't have crazy
	 * architectures returning stale data.
	 */
	mutex_lock(&userns_state_mutex);

	memset(&new_map, 0, sizeof(struct uid_gid_map));

	ret = -EPERM;
	/* Only allow one successful write to the map */
	if (map->nr_extents != 0)
		goto out;

	/*
	 * Adjusting namespace settings requires capabilities on the target.
	 */
	if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
		goto out;

	/* Parse the user data */
	ret = -EINVAL;
	pos = kbuf;
	for (; pos; pos = next_line) {

		/* Find the end of line and ensure I don't look past it */
		next_line = strchr(pos, '\n');
		if (next_line) {
			*next_line = '\0';
			next_line++;
			if (*next_line == '\0')
				next_line = NULL;
		}

		pos = skip_spaces(pos);
		extent.first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.lower_first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.count = simple_strtoul(pos, &pos, 10);
		if (*pos && !isspace(*pos))
			goto out;

		/* Verify there is not trailing junk on the line */
		pos = skip_spaces(pos);
		if (*pos != '\0')
			goto out;

		/* Verify we have been given valid starting values */
		if ((extent.first == (u32) -1) ||
		    (extent.lower_first == (u32) -1))
			goto out;

		/* Verify count is not zero and does not cause the
		 * extent to wrap
		 */
		if ((extent.first + extent.count) <= extent.first)
			goto out;
		if ((extent.lower_first + extent.count) <=
		     extent.lower_first)
			goto out;

		/* Do the ranges in extent overlap any previous extents? */
		if (mappings_overlap(&new_map, &extent))
			goto out;

		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
		    (next_line != NULL))
			goto out;

		ret = insert_extent(&new_map, &extent);
		if (ret < 0)
			goto out;
		ret = -EINVAL;
	}
	/* Be very certain the new map actually exists */
	if (new_map.nr_extents == 0)
		goto out;

	ret = -EPERM;
	/* Validate the user is allowed to use the ids mapped to. */
	if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
		goto out;

	ret = -EPERM;
	/* Map the lower ids from the parent user namespace to the
	 * kernel global id space.
	 */
	for (idx = 0; idx < new_map.nr_extents; idx++) {
		struct uid_gid_extent *e;
		u32 lower_first;

		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			e = &new_map.extent[idx];
		else
			e = &new_map.forward[idx];

		lower_first = map_id_range_down(parent_map,
						e->lower_first,
						e->count);

		/* Fail if we can not map the specified extent to
		 * the kernel global id space.
		 */
		if (lower_first == (u32) -1)
			goto out;

		e->lower_first = lower_first;
	}

	/*
	 * If we want to use binary search for lookup, this clones the extent
	 * array and sorts both copies.
	 */
	ret = sort_idmaps(&new_map);
	if (ret < 0)
		goto out;

	/* Install the map */
	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
		memcpy(map->extent, new_map.extent,
		       new_map.nr_extents * sizeof(new_map.extent[0]));
	} else {
		map->forward = new_map.forward;
		map->reverse = new_map.reverse;
	}
	smp_wmb();
	map->nr_extents = new_map.nr_extents;

	*ppos = count;
	ret = count;
out:
	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
		kfree(new_map.forward);
		kfree(new_map.reverse);
		map->forward = NULL;
		map->reverse = NULL;
		map->nr_extents = 0;
	}

	mutex_unlock(&userns_state_mutex);
	kfree(kbuf);
	return ret;
}

ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct user_namespace *seq_ns = seq_user_ns(seq);

	if (!ns->parent)
		return -EPERM;

	if ((seq_ns != ns) && (seq_ns != ns->parent))
		return -EPERM;

	return map_write(file, buf, size, ppos, CAP_SETUID,
			 &ns->uid_map, &ns->parent->uid_map);
}

ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct user_namespace *seq_ns = seq_user_ns(seq);

	if (!ns->parent)
		return -EPERM;

	if ((seq_ns != ns) && (seq_ns != ns->parent))
		return -EPERM;

	return map_write(file, buf, size, ppos, CAP_SETGID,
			 &ns->gid_map, &ns->parent->gid_map);
}

ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
			      size_t size, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct user_namespace *seq_ns = seq_user_ns(seq);

	if (!ns->parent)
		return -EPERM;

	if ((seq_ns != ns) && (seq_ns != ns->parent))
		return -EPERM;

	/* Anyone can set any valid project id no capability needed */
	return map_write(file, buf, size, ppos, -1,
			 &ns->projid_map, &ns->parent->projid_map);
}
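/*
 * Editorial example (not part of the file): the smp_wmb()/smp_rmb() pairing
 * in map_write() and map_id_range_down() above is the classic publish
 * pattern, fill in the payload first, then publish the element count, and
 * read in the opposite order. A userspace analog with C11 release/acquire:
 */
#include <stdatomic.h>
#include <stdint.h>

struct published_map {
	uint32_t extents[5];
	atomic_uint nr_extents;	/* 0 until the extents are ready */
};

static void publish(struct published_map *m, const uint32_t *src, unsigned n)
{
	for (unsigned i = 0; i < n; i++)
		m->extents[i] = src[i];
	/* release: everything above is visible before the count */
	atomic_store_explicit(&m->nr_extents, n, memory_order_release);
}

static unsigned snapshot(struct published_map *m, uint32_t *dst)
{
	/* acquire: pairs with the release store in publish() */
	unsigned n = atomic_load_explicit(&m->nr_extents, memory_order_acquire);

	for (unsigned i = 0; i < n; i++)
		dst[i] = m->extents[i];
	return n;
}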
static bool new_idmap_permitted(const struct file *file,
				struct user_namespace *ns, int cap_setid,
				struct uid_gid_map *new_map)
{
	const struct cred *cred = file->f_cred;

	if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
		return false;

	/* Don't allow mappings that would allow anything that wouldn't
	 * be allowed without the establishment of unprivileged mappings.
	 */
	if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
	    uid_eq(ns->owner, cred->euid)) {
		u32 id = new_map->extent[0].lower_first;
		if (cap_setid == CAP_SETUID) {
			kuid_t uid = make_kuid(ns->parent, id);
			if (uid_eq(uid, cred->euid))
				return true;
		} else if (cap_setid == CAP_SETGID) {
			kgid_t gid = make_kgid(ns->parent, id);
			if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
			    gid_eq(gid, cred->egid))
				return true;
		}
	}

	/* Allow anyone to set a mapping that doesn't require privilege */
	if (!cap_valid(cap_setid))
		return true;

	/* Allow the specified ids if we have the appropriate capability
	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
	 * And the opener of the id file also has the appropriate capability.
	 */
	if (ns_capable(ns->parent, cap_setid) &&
	    file_ns_capable(file, ns->parent, cap_setid))
		return true;

	return false;
}

int proc_setgroups_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	unsigned long userns_flags = READ_ONCE(ns->flags);

	seq_printf(seq, "%s\n",
		   (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
		   "allow" : "deny");
	return 0;
}

ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	char kbuf[8], *pos;
	bool setgroups_allowed;
	ssize_t ret;

	/* Only allow a very narrow range of strings to be written */
	ret = -EINVAL;
	if ((*ppos != 0) || (count >= sizeof(kbuf)))
		goto out;

	/* What was written? */
	ret = -EFAULT;
	if (copy_from_user(kbuf, buf, count))
		goto out;
	kbuf[count] = '\0';
	pos = kbuf;

	/* What is being requested? */
	ret = -EINVAL;
	if (strncmp(pos, "allow", 5) == 0) {
		pos += 5;
		setgroups_allowed = true;
	}
	else if (strncmp(pos, "deny", 4) == 0) {
		pos += 4;
		setgroups_allowed = false;
	}
	else
		goto out;

	/* Verify there is not trailing junk on the line */
	pos = skip_spaces(pos);
	if (*pos != '\0')
		goto out;

	ret = -EPERM;
	mutex_lock(&userns_state_mutex);
	if (setgroups_allowed) {
		/* Enabling setgroups after setgroups has been disabled
		 * is not allowed.
		 */
		if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
			goto out_unlock;
	} else {
		/* Permanently disabling setgroups after setgroups has
		 * been enabled by writing the gid_map is not allowed.
		 */
		if (ns->gid_map.nr_extents != 0)
			goto out_unlock;
		ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
	}
	mutex_unlock(&userns_state_mutex);

	/* Report a successful write */
	*ppos = count;
	ret = count;
out:
	return ret;
out_unlock:
	mutex_unlock(&userns_state_mutex);
	goto out;
}

bool userns_may_setgroups(const struct user_namespace *ns)
{
	bool allowed;

	mutex_lock(&userns_state_mutex);
	/* It is not safe to use setgroups until a gid mapping in
	 * the user namespace has been established.
	 */
	allowed = ns->gid_map.nr_extents != 0;
	/* Is setgroups allowed? */
	allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
	mutex_unlock(&userns_state_mutex);

	return allowed;
}

/*
 * Returns true if @child is the same namespace or a descendant of
 * @ancestor.
 */
bool in_userns(const struct user_namespace *ancestor,
	       const struct user_namespace *child)
{
	const struct user_namespace *ns;

	for (ns = child; ns->level > ancestor->level; ns = ns->parent)
		;
	return (ns == ancestor);
}

bool current_in_userns(const struct user_namespace *target_ns)
{
	return in_userns(target_ns, current_user_ns());
}
EXPORT_SYMBOL(current_in_userns);

static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
	return container_of(ns, struct user_namespace, ns);
}

static struct ns_common *userns_get(struct task_struct *task)
{
	struct user_namespace *user_ns;

	rcu_read_lock();
	user_ns = get_user_ns(__task_cred(task)->user_ns);
	rcu_read_unlock();

	return user_ns ? &user_ns->ns : NULL;
}

static void userns_put(struct ns_common *ns)
{
	put_user_ns(to_user_ns(ns));
}

static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
	struct user_namespace *user_ns = to_user_ns(ns);
	struct cred *cred;

	/* Don't allow gaining capabilities by reentering
	 * the same user namespace.
	 */
	if (user_ns == current_user_ns())
		return -EINVAL;

	/* Tasks that share a thread group must share a user namespace */
	if (!thread_group_empty(current))
		return -EINVAL;

	if (current->fs->users != 1)
		return -EINVAL;

	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cred = nsset_cred(nsset);
	if (!cred)
		return -EINVAL;

	put_user_ns(cred->user_ns);
	set_cred_user_ns(cred, get_user_ns(user_ns));

	if (set_cred_ucounts(cred) < 0)
		return -EINVAL;

	return 0;
}

struct ns_common *ns_get_owner(struct ns_common *ns)
{
	struct user_namespace *my_user_ns = current_user_ns();
	struct user_namespace *owner, *p;

	/* See if the owner is in the current user namespace */
	owner = p = ns->ops->owner(ns);
	for (;;) {
		if (!p)
			return ERR_PTR(-EPERM);
		if (p == my_user_ns)
			break;
		p = p->parent;
	}

	return &get_user_ns(owner)->ns;
}

static struct user_namespace *userns_owner(struct ns_common *ns)
{
	return to_user_ns(ns)->parent;
}

const struct proc_ns_operations userns_operations = {
	.name		= "user",
	.type		= CLONE_NEWUSER,
	.get		= userns_get,
	.put		= userns_put,
	.install	= userns_install,
	.owner		= userns_owner,
	.get_parent	= ns_get_owner,
};

static __init int user_namespaces_init(void)
{
	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
	return 0;
}
subsys_initcall(user_namespaces_init);
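/*
 * Editorial example (not part of the file): the canonical userspace sequence
 * for the interfaces implemented above. An unprivileged process unshares a
 * user namespace, disables setgroups, then maps its own uid/gid to 0.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_file(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, s, strlen(s)) < 0) {	/* maps must land in a single write() */
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char line[64];
	uid_t uid = getuid();
	gid_t gid = getgid();

	if (unshare(CLONE_NEWUSER)) {
		perror("unshare");
		return 1;
	}
	/* Unprivileged writers must disable setgroups before writing gid_map. */
	write_file("/proc/self/setgroups", "deny");

	snprintf(line, sizeof(line), "0 %u 1\n", uid);
	write_file("/proc/self/uid_map", line);
	snprintf(line, sizeof(line), "0 %u 1\n", gid);
	write_file("/proc/self/gid_map", line);

	printf("euid in new ns: %u\n", geteuid());	/* prints 0 on success */
	return 0;
}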
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#include <linux/sched.h>
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>

static int init_alloc_hint(struct sbitmap *sb, gfp_t flags)
{
	unsigned depth = sb->depth;

	sb->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
	if (!sb->alloc_hint)
		return -ENOMEM;

	if (depth && !sb->round_robin) {
		int i;

		for_each_possible_cpu(i)
			*per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth);
	}
	return 0;
}

static inline unsigned
update_alloc_hint_before_get(struct sbitmap *sb, unsigned int depth)
{
	unsigned hint;

	hint = this_cpu_read(*sb->alloc_hint);
	if (unlikely(hint >= depth)) {
		hint = depth ? get_random_u32_below(depth) : 0;
		this_cpu_write(*sb->alloc_hint, hint);
	}

	return hint;
}

static inline void update_alloc_hint_after_get(struct sbitmap *sb,
					       unsigned int depth,
					       unsigned int hint,
					       unsigned int nr)
{
	if (nr == -1) {
		/* If the map is full, a hint won't do us much good. */
		this_cpu_write(*sb->alloc_hint, 0);
	} else if (nr == hint || unlikely(sb->round_robin)) {
		/* Only update the hint if we used it. */
		hint = nr + 1;
		if (hint >= depth - 1)
			hint = 0;
		this_cpu_write(*sb->alloc_hint, hint);
	}
}

/*
 * See if we have deferred clears that we can batch move
 */
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
{
	unsigned long mask;

	if (!READ_ONCE(map->cleared))
		return false;

	/*
	 * First get a stable cleared mask, setting the old mask to 0.
	 */
	mask = xchg(&map->cleared, 0);

	/*
	 * Now clear the masked bits in our free word
	 */
	atomic_long_andnot(mask, (atomic_long_t *)&map->word);
	BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word));
	return true;
}

int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
		      gfp_t flags, int node, bool round_robin,
		      bool alloc_hint)
{
	unsigned int bits_per_word;

	if (shift < 0)
		shift = sbitmap_calculate_shift(depth);

	bits_per_word = 1U << shift;
	if (bits_per_word > BITS_PER_LONG)
		return -EINVAL;

	sb->shift = shift;
	sb->depth = depth;
	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
	sb->round_robin = round_robin;

	if (depth == 0) {
		sb->map = NULL;
		return 0;
	}

	if (alloc_hint) {
		if (init_alloc_hint(sb, flags))
			return -ENOMEM;
	} else {
		sb->alloc_hint = NULL;
	}

	sb->map = kvzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
	if (!sb->map) {
		free_percpu(sb->alloc_hint);
		return -ENOMEM;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);

void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
{
	unsigned int bits_per_word = 1U << sb->shift;
	unsigned int i;

	for (i = 0; i < sb->map_nr; i++)
		sbitmap_deferred_clear(&sb->map[i]);

	sb->depth = depth;
	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
}
EXPORT_SYMBOL_GPL(sbitmap_resize);

static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
			      unsigned int hint, bool wrap)
{
	int nr;

	/* don't wrap if starting from 0 */
	wrap = wrap && hint;

	while (1) {
		nr = find_next_zero_bit(word, depth, hint);
		if (unlikely(nr >= depth)) {
			/*
			 * We started with an offset, and we didn't reset the
			 * offset to 0 in a failure case, so start from 0 to
			 * exhaust the map.
			 */
			if (hint && wrap) {
				hint = 0;
				continue;
			}
			return -1;
		}

		if (!test_and_set_bit_lock(nr, word))
			break;

		hint = nr + 1;
		if (hint >= depth - 1)
			hint = 0;
	}

	return nr;
}

static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
				    unsigned int depth,
				    unsigned int alloc_hint,
				    bool wrap)
{
	int nr;

	do {
		nr = __sbitmap_get_word(&map->word, depth,
					alloc_hint, wrap);
		if (nr != -1)
			break;
		if (!sbitmap_deferred_clear(map))
			break;
	} while (1);

	return nr;
}
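/*
 * Editorial example (not part of the file): the deferred-clear trick above in
 * miniature. Frees only OR a bit into "cleared"; allocators fold the whole
 * cleared mask back into "word" with one exchange plus one and-not, so the
 * hot free path never contends on the allocation word. Userspace analog:
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong word;	/* 1 = allocated */
static atomic_ulong cleared;	/* 1 = freed since last fold */

static void lazy_free(unsigned bit)
{
	atomic_fetch_or(&cleared, 1UL << bit);
}

static int fold_deferred_clears(void)
{
	/* grab a stable snapshot of the cleared mask, zeroing it */
	unsigned long mask = atomic_exchange(&cleared, 0);

	if (!mask)
		return 0;
	atomic_fetch_and(&word, ~mask);	/* drop those bits from "word" */
	return 1;
}

int main(void)
{
	atomic_store(&word, 0x7UL);	/* bits 0-2 allocated */
	lazy_free(1);
	fold_deferred_clears();
	printf("word=%#lx\n", atomic_load(&word));	/* word=0x5 */
	return 0;
}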
static int sbitmap_find_bit(struct sbitmap *sb,
			    unsigned int depth,
			    unsigned int index,
			    unsigned int alloc_hint,
			    bool wrap)
{
	unsigned int i;
	int nr = -1;

	for (i = 0; i < sb->map_nr; i++) {
		nr = sbitmap_find_bit_in_word(&sb->map[index],
					      min_t(unsigned int,
						    __map_depth(sb, index),
						    depth),
					      alloc_hint, wrap);

		if (nr != -1) {
			nr += index << sb->shift;
			break;
		}

		/* Jump to next index. */
		alloc_hint = 0;
		if (++index >= sb->map_nr)
			index = 0;
	}

	return nr;
}

static int __sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
{
	unsigned int index;

	index = SB_NR_TO_INDEX(sb, alloc_hint);

	/*
	 * Unless we're doing round robin tag allocation, just use the
	 * alloc_hint to find the right word index. No point in looping
	 * twice in find_next_zero_bit() for that case.
	 */
	if (sb->round_robin)
		alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);
	else
		alloc_hint = 0;

	return sbitmap_find_bit(sb, UINT_MAX, index, alloc_hint,
				!sb->round_robin);
}

int sbitmap_get(struct sbitmap *sb)
{
	int nr;
	unsigned int hint, depth;

	if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
		return -1;

	depth = READ_ONCE(sb->depth);
	hint = update_alloc_hint_before_get(sb, depth);
	nr = __sbitmap_get(sb, hint);
	update_alloc_hint_after_get(sb, depth, hint, nr);

	return nr;
}
EXPORT_SYMBOL_GPL(sbitmap_get);

static int __sbitmap_get_shallow(struct sbitmap *sb,
				 unsigned int alloc_hint,
				 unsigned long shallow_depth)
{
	unsigned int index;

	index = SB_NR_TO_INDEX(sb, alloc_hint);
	alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);

	return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true);
}

int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
{
	int nr;
	unsigned int hint, depth;

	if (WARN_ON_ONCE(unlikely(!sb->alloc_hint)))
		return -1;

	depth = READ_ONCE(sb->depth);
	hint = update_alloc_hint_before_get(sb, depth);
	nr = __sbitmap_get_shallow(sb, hint, shallow_depth);
	update_alloc_hint_after_get(sb, depth, hint, nr);

	return nr;
}
EXPORT_SYMBOL_GPL(sbitmap_get_shallow);

bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
	unsigned int i;

	for (i = 0; i < sb->map_nr; i++) {
		if (sb->map[i].word & ~sb->map[i].cleared)
			return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);

static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
{
	unsigned int i, weight = 0;

	for (i = 0; i < sb->map_nr; i++) {
		const struct sbitmap_word *word = &sb->map[i];
		unsigned int word_depth = __map_depth(sb, i);

		if (set)
			weight += bitmap_weight(&word->word, word_depth);
		else
			weight += bitmap_weight(&word->cleared, word_depth);
	}
	return weight;
}

static unsigned int sbitmap_cleared(const struct sbitmap *sb)
{
	return __sbitmap_weight(sb, false);
}

unsigned int sbitmap_weight(const struct sbitmap *sb)
{
	return __sbitmap_weight(sb, true) - sbitmap_cleared(sb);
}
EXPORT_SYMBOL_GPL(sbitmap_weight);

void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
{
	seq_printf(m, "depth=%u\n", sb->depth);
	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
	seq_printf(m, "map_nr=%u\n", sb->map_nr);
}
EXPORT_SYMBOL_GPL(sbitmap_show);

static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
{
	if ((offset & 0xf) == 0) {
		if (offset != 0)
			seq_putc(m, '\n');
		seq_printf(m, "%08x:", offset);
	}
	if ((offset & 0x1) == 0)
		seq_putc(m, ' ');
	seq_printf(m, "%02x", byte);
}

void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
{
	u8 byte = 0;
	unsigned int byte_bits = 0;
	unsigned int offset = 0;
	int i;

	for (i = 0; i < sb->map_nr; i++) {
		unsigned long word = READ_ONCE(sb->map[i].word);
		unsigned long cleared = READ_ONCE(sb->map[i].cleared);
		unsigned int word_bits = __map_depth(sb, i);

		word &= ~cleared;

		while (word_bits > 0) {
			unsigned int bits = min(8 - byte_bits, word_bits);

			byte |= (word & (BIT(bits) - 1)) << byte_bits;
			byte_bits += bits;
			if (byte_bits == 8) {
				emit_byte(m, offset, byte);
				byte = 0;
				byte_bits = 0;
				offset++;
			}
			word >>= bits;
			word_bits -= bits;
		}
	}
	if (byte_bits) {
		emit_byte(m, offset, byte);
		offset++;
	}
	if (offset)
		seq_putc(m, '\n');
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);

static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
					unsigned int depth)
{
	unsigned int wake_batch;
	unsigned int shallow_depth;

	/*
	 * Each full word of the bitmap has bits_per_word bits, and there might
	 * be a partial word. There are depth / bits_per_word full words and
	 * depth % bits_per_word bits left over. In bitwise arithmetic:
	 *
	 * bits_per_word = 1 << shift
	 * depth / bits_per_word = depth >> shift
	 * depth % bits_per_word = depth & ((1 << shift) - 1)
	 *
	 * Each word can be limited to sbq->min_shallow_depth bits.
	 */
	shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
	depth = ((depth >> sbq->sb.shift) * shallow_depth +
		 min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
	wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
			     SBQ_WAKE_BATCH);

	return wake_batch;
}

int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
			    int shift, bool round_robin, gfp_t flags, int node)
{
	int ret;
	int i;

	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node,
				round_robin, true);
	if (ret)
		return ret;

	sbq->min_shallow_depth = UINT_MAX;
	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
	atomic_set(&sbq->wake_index, 0);
	atomic_set(&sbq->ws_active, 0);
	atomic_set(&sbq->completion_cnt, 0);
	atomic_set(&sbq->wakeup_cnt, 0);

	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
	if (!sbq->ws) {
		sbitmap_free(&sbq->sb);
		return -ENOMEM;
	}

	for (i = 0; i < SBQ_WAIT_QUEUES; i++)
		init_waitqueue_head(&sbq->ws[i].wait);

	return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);

static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
					    unsigned int depth)
{
	unsigned int wake_batch;

	wake_batch = sbq_calc_wake_batch(sbq, depth);
	if (sbq->wake_batch != wake_batch)
		WRITE_ONCE(sbq->wake_batch, wake_batch);
}

void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
					  unsigned int users)
{
	unsigned int wake_batch;
	unsigned int depth = (sbq->sb.depth + users - 1) / users;

	wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, 1, SBQ_WAKE_BATCH);

	WRITE_ONCE(sbq->wake_batch, wake_batch);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);

void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
	sbitmap_queue_update_wake_batch(sbq, depth);
	sbitmap_resize(&sbq->sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);

int __sbitmap_queue_get(struct sbitmap_queue *sbq)
{
	return sbitmap_get(&sbq->sb);
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
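/*
 * Editorial example (not part of the file): __sbitmap_queue_get_batch() below
 * claims nr_tags adjacent bits with one compare-and-swap loop. The core mask
 * arithmetic in isolation, ((1 << n) - 1) << nr selects the candidate bits,
 * and the bits that were still zero in the old word are the ones we won:
 */
#include <stdatomic.h>
#include <stdio.h>

/* Try to claim n consecutive bits starting at nr; return the won mask >> nr. */
static unsigned long claim_bits(atomic_ulong *word, unsigned nr, unsigned n)
{
	unsigned long get_mask = ((1UL << n) - 1) << nr;
	unsigned long val = atomic_load(word);

	while (!atomic_compare_exchange_weak(word, &val, val | get_mask))
		;	/* on failure val is refreshed to the latest value; retry */

	return (get_mask & ~val) >> nr;	/* bits that were free when we set them */
}

int main(void)
{
	atomic_ulong word;

	atomic_init(&word, 0x1UL);	/* bit 0 already taken */
	/* asks for bits 0-3; only bits 1-3 were free, so the won mask is 0xe */
	printf("won mask: %#lx\n", claim_bits(&word, 0, 4));
	return 0;
}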
unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
					unsigned int *offset)
{
	struct sbitmap *sb = &sbq->sb;
	unsigned int hint, depth;
	unsigned long index, nr;
	int i;

	if (unlikely(sb->round_robin))
		return 0;

	depth = READ_ONCE(sb->depth);
	hint = update_alloc_hint_before_get(sb, depth);

	index = SB_NR_TO_INDEX(sb, hint);

	for (i = 0; i < sb->map_nr; i++) {
		struct sbitmap_word *map = &sb->map[index];
		unsigned long get_mask;
		unsigned int map_depth = __map_depth(sb, index);

		sbitmap_deferred_clear(map);
		if (map->word == (1UL << (map_depth - 1)) - 1)
			goto next;

		nr = find_first_zero_bit(&map->word, map_depth);
		if (nr + nr_tags <= map_depth) {
			atomic_long_t *ptr = (atomic_long_t *) &map->word;
			unsigned long val;

			get_mask = ((1UL << nr_tags) - 1) << nr;
			val = READ_ONCE(map->word);
			while (!atomic_long_try_cmpxchg(ptr, &val,
							get_mask | val))
				;
			get_mask = (get_mask & ~val) >> nr;
			if (get_mask) {
				*offset = nr + (index << sb->shift);
				update_alloc_hint_after_get(sb, depth, hint,
							*offset + nr_tags - 1);
				return get_mask;
			}
		}
next:
		/* Jump to next index. */
		if (++index >= sb->map_nr)
			index = 0;
	}

	return 0;
}

int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
			      unsigned int shallow_depth)
{
	WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);

	return sbitmap_get_shallow(&sbq->sb, shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_get_shallow);

void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
				     unsigned int min_shallow_depth)
{
	sbq->min_shallow_depth = min_shallow_depth;
	sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);

static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
	int i, wake_index, woken;

	if (!atomic_read(&sbq->ws_active))
		return;

	wake_index = atomic_read(&sbq->wake_index);
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[wake_index];

		/*
		 * Advance the index before checking the current queue.
		 * It improves fairness, by ensuring the queue doesn't
		 * need to be fully emptied before trying to wake up
		 * from the next one.
		 */
		wake_index = sbq_index_inc(wake_index);

		if (waitqueue_active(&ws->wait)) {
			woken = wake_up_nr(&ws->wait, nr);
			if (woken == nr)
				break;
			nr -= woken;
		}
	}

	if (wake_index != atomic_read(&sbq->wake_index))
		atomic_set(&sbq->wake_index, wake_index);
}

void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
	unsigned int wake_batch = READ_ONCE(sbq->wake_batch);
	unsigned int wakeups;

	if (!atomic_read(&sbq->ws_active))
		return;

	atomic_add(nr, &sbq->completion_cnt);
	wakeups = atomic_read(&sbq->wakeup_cnt);

	do {
		if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch)
			return;
	} while (!atomic_try_cmpxchg(&sbq->wakeup_cnt,
				     &wakeups, wakeups + wake_batch));

	__sbitmap_queue_wake_up(sbq, wake_batch);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);

static inline void sbitmap_update_cpu_hint(struct sbitmap *sb, int cpu,
					   int tag)
{
	if (likely(!sb->round_robin && tag < sb->depth))
		data_race(*per_cpu_ptr(sb->alloc_hint, cpu) = tag);
}

void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
			       int *tags, int nr_tags)
{
	struct sbitmap *sb = &sbq->sb;
	unsigned long *addr = NULL;
	unsigned long mask = 0;
	int i;

	smp_mb__before_atomic();
	for (i = 0; i < nr_tags; i++) {
		const int tag = tags[i] - offset;
		unsigned long *this_addr;

		/* since we're clearing a batch, skip the deferred map */
		this_addr = &sb->map[SB_NR_TO_INDEX(sb, tag)].word;
		if (!addr) {
			addr = this_addr;
		} else if (addr != this_addr) {
			atomic_long_andnot(mask, (atomic_long_t *) addr);
			mask = 0;
			addr = this_addr;
		}
		mask |= (1UL << SB_NR_TO_BIT(sb, tag));
	}

	if (mask)
		atomic_long_andnot(mask, (atomic_long_t *) addr);

	smp_mb__after_atomic();
	sbitmap_queue_wake_up(sbq, nr_tags);
	sbitmap_update_cpu_hint(&sbq->sb, raw_smp_processor_id(),
				tags[nr_tags - 1] - offset);
}
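/*
 * Illustrative sketch (not part of the kernel source): how a user such as
 * blk-mq pairs the batched helpers above. The "my_*" names are
 * hypothetical and tag bookkeeping is trimmed; __sbitmap_queue_get_batch()
 * returns a mask of the bits it acquired, starting at *first_tag.
 */
#if 0
static int my_get_tag_batch(struct sbitmap_queue *sbq, int want,
			    unsigned int *first_tag)
{
	unsigned long mask = __sbitmap_queue_get_batch(sbq, want, first_tag);

	return hweight_long(mask);	/* how many tags were actually taken */
}

static void my_put_tag_batch(struct sbitmap_queue *sbq, int *tags, int nr)
{
	/* tags[] holds absolute tag numbers; 0 here means no tag offset. */
	sbitmap_queue_clear_batch(sbq, 0, tags, nr);
}
#endif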
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
			 unsigned int cpu)
{
	/*
	 * Once the cleared bit is set, the bit may be allocated again.
	 *
	 * This barrier orders READs/WRITEs on the associated instance (such
	 * as a blk_mq request) against the bit, to avoid racing with
	 * re-allocation; its pair is the memory barrier implied in
	 * __sbitmap_get_word.
	 *
	 * One invariant is that the cleared bit must be zero while the bit
	 * is in use.
	 */
	smp_mb__before_atomic();
	sbitmap_deferred_clear_bit(&sbq->sb, nr);

	/*
	 * Pairs with the memory barrier in set_current_state() to ensure the
	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
	 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
	 * waiter. See the comment on waitqueue_active().
	 */
	smp_mb__after_atomic();
	sbitmap_queue_wake_up(sbq, 1);
	sbitmap_update_cpu_hint(&sbq->sb, cpu, nr);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);

void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
{
	int i, wake_index;

	/*
	 * Pairs with the memory barrier in set_current_state() like in
	 * sbitmap_queue_wake_up().
	 */
	smp_mb();
	wake_index = atomic_read(&sbq->wake_index);
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[wake_index];

		if (waitqueue_active(&ws->wait))
			wake_up(&ws->wait);

		wake_index = sbq_index_inc(wake_index);
	}
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);

void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
{
	bool first;
	int i;

	sbitmap_show(&sbq->sb, m);

	seq_puts(m, "alloc_hint={");
	first = true;
	for_each_possible_cpu(i) {
		if (!first)
			seq_puts(m, ", ");
		first = false;
		seq_printf(m, "%u", *per_cpu_ptr(sbq->sb.alloc_hint, i));
	}
	seq_puts(m, "}\n");

	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
	seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));

	seq_puts(m, "ws={\n");
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[i];

		seq_printf(m, "\t{.wait=%s},\n",
			   waitqueue_active(&ws->wait) ? "active" : "inactive");
	}
	seq_puts(m, "}\n");

	seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin);
	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);

void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
			    struct sbq_wait_state *ws,
			    struct sbq_wait *sbq_wait)
{
	if (!sbq_wait->sbq) {
		sbq_wait->sbq = sbq;
		atomic_inc(&sbq->ws_active);
		add_wait_queue(&ws->wait, &sbq_wait->wait);
	}
}
EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue);

void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait)
{
	list_del_init(&sbq_wait->wait.entry);
	if (sbq_wait->sbq) {
		atomic_dec(&sbq_wait->sbq->ws_active);
		sbq_wait->sbq = NULL;
	}
}
EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue);

void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
			     struct sbq_wait_state *ws,
			     struct sbq_wait *sbq_wait, int state)
{
	if (!sbq_wait->sbq) {
		atomic_inc(&sbq->ws_active);
		sbq_wait->sbq = sbq;
	}
	prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
}
EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);

void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
			 struct sbq_wait *sbq_wait)
{
	finish_wait(&ws->wait, &sbq_wait->wait);
	if (sbq_wait->sbq) {
		atomic_dec(&sbq->ws_active);
		sbq_wait->sbq = NULL;
	}
}
EXPORT_SYMBOL_GPL(sbitmap_finish_wait);
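/*
 * Illustrative sketch (not part of the kernel source): the waiter side of
 * the API above, shaped like blk_mq_get_tag(). The "my_*" name is
 * hypothetical and always picking ws[0] is a simplification; a real user
 * spreads waiters across the SBQ_WAIT_QUEUES wait queues.
 */
#if 0
static int my_get_tag_or_wait(struct sbitmap_queue *sbq)
{
	struct sbq_wait_state *ws = &sbq->ws[0];
	DEFINE_SBQ_WAIT(wait);
	int tag;

	do {
		sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);
		tag = __sbitmap_queue_get(sbq);	/* recheck after queueing */
		if (tag >= 0)
			break;
		io_schedule();	/* woken in wake_batch-sized groups */
	} while (1);
	sbitmap_finish_wait(sbq, ws, &wait);

	return tag;
}
#endif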
// SPDX-License-Identifier: GPL-2.0-only
#include <net/tcp.h>

/* The bandwidth estimator estimates the rate at which the network
 * can currently deliver outbound data packets for this flow. At a high
 * level, it operates by taking a delivery rate sample for each ACK.
 *
 * A rate sample records the rate at which the network delivered packets
 * for this flow, calculated over the time interval between the transmission
 * of a data packet and the acknowledgment of that packet.
 *
 * Specifically, over the interval between each transmit and corresponding ACK,
 * the estimator generates a delivery rate sample. Typically it uses the rate
 * at which packets were acknowledged. However, the approach of using only the
 * acknowledgment rate faces a challenge under the prevalent ACK decimation or
 * compression: packets can temporarily appear to be delivered much quicker
 * than the bottleneck rate. Since it is physically impossible to do that in a
 * sustained fashion, when the estimator notices that the ACK rate is faster
 * than the transmit rate, it uses the latter:
 *
 *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
 *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
 *    bw = min(send_rate, ack_rate)
 *
 * Notice the estimator essentially estimates the goodput, not always the
 * network bottleneck link rate when the sending or receiving is limited by
 * other factors like applications or receiver window limits. The estimator
 * deliberately avoids using the inter-packet spacing approach because that
 * approach requires a large number of samples and sophisticated filtering.
 *
 * TCP flows can often be application-limited in request/response workloads.
 * The estimator marks a bandwidth sample as application-limited if there
 * was some moment during the sampled window of packets when there was no data
 * ready to send in the write queue.
 */

/* Snapshot the current delivery information in the skb, to generate
 * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
 */
void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* In general we need to start delivery rate samples from the
	 * time we received the most recent ACK, to ensure we include
	 * the full time the network needs to deliver all in-flight
	 * packets. If there are no packets in flight yet, then we
	 * know that any ACKs after now indicate that the network was
	 * able to deliver those packets completely in the sampling
	 * interval between now and the next ACK.
	 *
	 * Note that we use packets_out instead of tcp_packets_in_flight(tp)
	 * because the latter is a guess based on RTO and loss-marking
	 * heuristics. We don't want spurious RTOs or loss markings to cause
	 * a spuriously small time interval, causing a spuriously high
	 * bandwidth estimate.
	 */
	if (!tp->packets_out) {
		u64 tstamp_us = tcp_skb_timestamp_us(skb);

		tp->first_tx_mstamp  = tstamp_us;
		tp->delivered_mstamp = tstamp_us;
	}

	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
}
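/* Illustrative worked example (not part of the kernel source) for the
 * bw = min(send_rate, ack_rate) rule above: suppose 10 packets whose send
 * phase spanned 5 ms are delivered, but ACK compression delivers all of
 * their ACKs within 1 ms. Then ack_rate = 10 pkts / 1 ms would overstate
 * the path capacity, while send_rate = 10 pkts / 5 ms = 2 pkts/ms reflects
 * it, so the estimator reports bw = min(10, 2) = 2 pkts/ms.
 */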
/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
 * delivery information when the skb was last transmitted.
 *
 * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
 * called multiple times. We favor the information from the most recently
 * sent skb, i.e., the skb with the most recently sent time and the highest
 * sequence.
 */
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
			    struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
	u64 tx_tstamp;

	if (!scb->tx.delivered_mstamp)
		return;

	tx_tstamp = tcp_skb_timestamp_us(skb);
	if (!rs->prior_delivered ||
	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
			       scb->end_seq, rs->last_end_seq)) {
		rs->prior_delivered_ce  = scb->tx.delivered_ce;
		rs->prior_delivered  = scb->tx.delivered;
		rs->prior_mstamp     = scb->tx.delivered_mstamp;
		rs->is_app_limited   = scb->tx.is_app_limited;
		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
		rs->last_end_seq     = scb->end_seq;

		/* Record send time of most recently ACKed packet: */
		tp->first_tx_mstamp  = tx_tstamp;
		/* Find the duration of the "send phase" of this window: */
		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
						     scb->tx.first_tx_mstamp);
	}

	/* Mark off the skb delivered once it's sacked to avoid being
	 * used again when it's cumulatively acked. For acked packets
	 * we don't need to reset since it'll be freed soon.
	 */
	if (scb->sacked & TCPCB_SACKED_ACKED)
		scb->tx.delivered_mstamp = 0;
}
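/* Illustrative sketch (not part of the kernel source): how a congestion
 * control module might consume the rate sample assembled above and by
 * tcp_rate_gen() below, in the style of BBR's bandwidth estimator. The
 * function name is hypothetical; div64_u64() is from <linux/math64.h>.
 */
#if 0
static u64 my_sample_bw_pps(const struct rate_sample *rs)
{
	/* tcp_rate_gen() signals an unusable sample with negative fields. */
	if (rs->delivered < 0 || rs->interval_us <= 0)
		return 0;

	/* Delivered packets over the sampling interval, in packets/sec. */
	return div64_u64((u64)rs->delivered * USEC_PER_SEC,
			 (u64)rs->interval_us);
}
#endif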
/* Update the connection delivery information and generate a rate sample. */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
		  bool is_sack_reneg, struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 snd_us, ack_us;

	/* Clear app limited if bubble is acked and gone. */
	if (tp->app_limited && after(tp->delivered, tp->app_limited))
		tp->app_limited = 0;

	/* TODO: there are multiple places throughout tcp_ack() to get
	 * current time. Refactor the code using a new "tcp_acktag_state"
	 * to carry current time, flags, stats like "tcp_sacktag_state".
	 */
	if (delivered)
		tp->delivered_mstamp = tp->tcp_mstamp;

	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
	rs->losses = lost;		/* freshly marked lost */
	/* Return an invalid sample if no timing information is available or
	 * in recovery from loss with SACK reneging. Rate samples taken during
	 * a SACK reneging event may overestimate bw by including packets that
	 * were SACKed before the reneg.
	 */
	if (!rs->prior_mstamp || is_sack_reneg) {
		rs->delivered = -1;
		rs->interval_us = -1;
		return;
	}
	rs->delivered   = tp->delivered - rs->prior_delivered;
	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
	/* delivered_ce occupies less than 32 bits in the skb control block */
	rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;

	/* Model sending data and receiving ACKs as separate pipeline phases
	 * for a window. Usually the ACK phase is longer, but with ACK
	 * compression the send phase can be longer. To be safe we use the
	 * longer phase.
	 */
	snd_us = rs->interval_us;				/* send phase */
	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
				    rs->prior_mstamp);		/* ack phase */
	rs->interval_us = max(snd_us, ack_us);

	/* Record both segment send and ack receive intervals */
	rs->snd_interval_us = snd_us;
	rs->rcv_interval_us = ack_us;

	/* Normally we expect interval_us >= min-rtt.
	 * Note that rate may still be over-estimated when a spuriously
	 * retransmitted skb was first (s)acked because "interval_us"
	 * is under-estimated (up to an RTT). However continuously
	 * measuring the delivery rate during loss recovery is crucial
	 * for connections that suffer heavy or prolonged losses.
	 */
	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
		if (!rs->is_retrans)
			pr_debug("tcp rate: %ld %d %u %u %u\n",
				 rs->interval_us, rs->delivered,
				 inet_csk(sk)->icsk_ca_state,
				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
		rs->interval_us = -1;
		return;
	}

	/* Record the last non-app-limited or the highest app-limited bw */
	if (!rs->is_app_limited ||
	    ((u64)rs->delivered * tp->rate_interval_us >=
	     (u64)tp->rate_delivered * rs->interval_us)) {
		tp->rate_delivered = rs->delivered;
		tp->rate_interval_us = rs->interval_us;
		tp->rate_app_limited = rs->is_app_limited;
	}
}

/* If a gap is detected between sends, mark the socket application-limited. */
void tcp_rate_check_app_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (/* We have less than one packet to send. */
	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
	    /* Nothing in sending host's qdisc queues or NIC tx queue. */
	    sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
	    /* We are not limited by CWND. */
	    tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
	    /* All lost packets have been retransmitted. */
	    tp->lost_out <= tp->retrans_out)
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
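/* Illustrative worked example (not part of the kernel source): if
 * tp->delivered is 100 and 10 packets are in flight when the sender runs
 * out of data, tcp_rate_check_app_limited() sets tp->app_limited to
 * 100 + 10 = 110. Packets sent while that marker is set are snapshotted
 * with is_app_limited = 1 in tcp_rate_skb_sent(), so the rate samples they
 * generate are flagged as application-limited; once tp->delivered moves
 * past 110, tcp_rate_gen() clears the marker ("bubble is acked and gone").
 */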
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 * bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
	/*
	 * Assuming the lock is uncontended, this never enters
	 * the body of the outer loop. If it is contended, then
	 * within the inner loop a non-atomic test is used to
	 * busywait with less bus contention for a good time to
	 * attempt to acquire the lock bit.
	 */
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		do {
			cpu_relax();
		} while (test_bit(bitnum, addr));
		preempt_disable();
	}
#endif
	__acquire(bitlock);
}

/*
 * Return true if it was acquired
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		return 0;
	}
#endif
	__acquire(bitlock);
	return 1;
}

/*
 * bit-based spin_unlock()
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(bitlock);
}

/*
 * bit-based spin_unlock()
 * non-atomic version, which can be used e.g. if the bit lock itself is
 * protecting the rest of the flags in the word.
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	__clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(bitlock);
}

/*
 * Return true if the lock is held.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
	return preempt_count();
#else
	return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */
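/*
 * Illustrative sketch (not part of the kernel source): using bit 0 of a
 * flags word as the lock that protects the remaining bits of the same
 * word, the pattern that users such as hlist_bl rely on. The "my_*" names
 * are hypothetical.
 */
#if 0
#define MY_LOCK_BIT	0	/* bit 0: lock; higher bits: state */

struct my_obj {
	unsigned long flags;
};

static void my_obj_set_flag(struct my_obj *obj, int flag_bit)
{
	bit_spin_lock(MY_LOCK_BIT, &obj->flags);
	/* Non-atomic bitops are safe: every writer holds the bit lock. */
	__set_bit(flag_bit, &obj->flags);
	/*
	 * The non-atomic unlock is also safe here, precisely because the
	 * lock bit protects the rest of the word (see __bit_spin_unlock()
	 * above).
	 */
	__bit_spin_unlock(MY_LOCK_BIT, &obj->flags);
}
#endif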
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER	(PUD_SHIFT - PAGE_SHIFT)

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE. However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING	0UL
#endif

/*
 * This defines the first usable user address. Platforms
 * can override its value with custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS	0UL
#endif

/*
 * This defines the generic helper for accessing PMD page
 * table page. Although platforms can still override this
 * via their respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif

/*
 * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
*/ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #ifdef CONFIG_HIGHPTE #define __pte_map(pmd, address) \ ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline void pte_unmap(pte_t *pte) { rcu_read_unlock(); } #endif void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); /* Find an entry in the second-level page table.. */ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #ifndef pgd_offset_k #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) #endif /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef pmd_young static inline int pmd_young(pmd_t pmd) { return 0; } #endif #ifndef pmd_dirty static inline int pmd_dirty(pmd_t pmd) { return 0; } #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. 
A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. This mode can only be entered and left under the protection of * the page table locks for all page tables which may be modified. In the UP * case, this is required so that preemption is disabled, and in the SMP case, * it must synchronize the delayed page table writes properly on other CPUs. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif #ifndef set_ptes #ifndef pte_next_pfn static inline pte_t pte_next_pfn(pte_t pte) { return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); } #endif /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. * @addr: Address to map the first page at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @nr: Number of pages to map. * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * * Context: The caller holds the page table lock. The pages all belong * to the same folio. The PTEs are all in the same PMD. */ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_next_pfn(pte); } arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef ptep_get static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); } #endif #ifndef pudp_get static inline pud_t pudp_get(pud_t *pudp) { return READ_ONCE(*pudp); } #endif #ifndef p4dp_get static inline p4d_t p4dp_get(p4d_t *p4dp) { return READ_ONCE(*p4dp); } #endif #ifndef pgdp_get static inline pgd_t pgdp_get(pgd_t *pgdp) { return READ_ONCE(*pgdp); } #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || 
defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef arch_has_hw_nonleaf_pmd_young /* * Return whether the accessed bit in non-leaf PMD entries is supported on the * local CPU. */ static inline bool arch_has_hw_nonleaf_pmd_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG); } #endif #ifndef arch_has_hw_pte_young /* * Return whether the accessed bit is supported on the local CPU. * * This stub assumes accessing through an old PTE triggers a page fault. * Architectures that automatically set the access bit should overwrite it. */ static inline bool arch_has_hw_pte_young(void) { return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif #ifndef arch_check_zapped_pte static inline void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { } #endif #ifndef arch_check_zapped_pmd static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { } #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); page_table_check_pte_clear(mm, pte); return pte; } #endif static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep_get_and_clear(mm, addr, ptep); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive * instructions. We are guaranteed that a PTE will only either go from not * present to present, or present to not present -- it will not switch to a * completely different present page without a TLB flush inbetween; which we * are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. 
Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #define ptep_get_lockless ptep_get_lockless #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { pmd_t pmd; do { pmd.pmd_low = pmdp->pmd_low; smp_rmb(); pmd.pmd_high = pmdp->pmd_high; smp_rmb(); } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); return pmd; } #define pmdp_get_lockless pmdp_get_lockless #define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. */ #ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif #ifndef pmdp_get_lockless static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } static inline void pmdp_get_lockless_sync(void) { } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); page_table_check_pmd_clear(mm, pmd); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); page_table_check_pud_clear(mm, pud); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { return ptep_get_and_clear(mm, address, ptep); } #endif /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { } #define __HAVE_ARCH_UPDATE_MMU_TLB #endif /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. 
*/ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef pte_mkwrite static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { return pte_mkwrite_novma(pte); } #endif #if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { return pmd_mkwrite_novma(pmd); } #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = ptep_get(ptep); set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. */ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have 
hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD /* * pmdp_invalidate_ad() invalidates the PMD while changing a transparent * hugepage mapping in the page tables. This function is similar to * pmdp_invalidate(), but should only be used if the access and dirty bits would * not be cleared by the software in the new PMD value. The function ensures * that hardware changes of the access and dirty bits updates would not be lost. * * Doing so can allow in certain architectures to avoid a TLB flush in most * cases. Yet, another TLB flush might be necessary later if the PMD update * itself requires such flush (e.g., if protection was set to be stricter). Yet, * even when a TLB flush is needed because of the update, the caller may be able * to batch these TLB flushing operations, so fewer TLB flush operations are * needed. */ extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. */ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } #endif #ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". For example, use * in scenarios where it is known ahead of time that the routine is * setting non-present entries, or re-setting an existing entry to the * same value. Otherwise, use the typical "set" helpers and flush the * TLB. 
*/ #define set_pte_safe(ptep, pte) \ ({ \ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ set_pte(ptep, pte); \ }) #define set_pmd_safe(pmdp, pmd) \ ({ \ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ set_pmd(pmdp, pmd); \ }) #define set_pud_safe(pudp, pud) \ ({ \ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ set_pud(pudp, pud); \ }) #define set_p4d_safe(p4dp, p4d) \ ({ \ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ set_p4d(p4dp, p4d); \ }) #define set_pgd_safe(pgdp, pgd) \ ({ \ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte) { } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct page *page) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) { } #endif #ifndef __HAVE_ARCH_PGD_OFFSET_GATE #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? 
__boundary: (end);						\
})
#endif

#ifndef pmd_addr_end
#define pmd_addr_end(addr, end)						\
({	unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;	\
	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
void pgd_clear_bad(pgd_t *);

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
#define p4d_clear_bad(p4d)	do { } while (0)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
#define pud_clear_bad(pud)	do { } while (0)
#endif

void pmd_clear_bad(pmd_t *);

static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
	if (pgd_none(*pgd))
		return 1;
	if (unlikely(pgd_bad(*pgd))) {
		pgd_clear_bad(pgd);
		return 1;
	}
	return 0;
}

static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
	if (p4d_none(*p4d))
		return 1;
	if (unlikely(p4d_bad(*p4d))) {
		p4d_clear_bad(p4d);
		return 1;
	}
	return 0;
}

static inline int pud_none_or_clear_bad(pud_t *pud)
{
	if (pud_none(*pud))
		return 1;
	if (unlikely(pud_bad(*pud))) {
		pud_clear_bad(pud);
		return 1;
	}
	return 0;
}

static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
	if (pmd_none(*pmd))
		return 1;
	if (unlikely(pmd_bad(*pmd))) {
		pmd_clear_bad(pmd);
		return 1;
	}
	return 0;
}

static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
					     unsigned long addr,
					     pte_t *ptep)
{
	/*
	 * Get the current pte state, but zero it out to make it
	 * non-present, preventing the hardware from asynchronously
	 * updating it.
	 */
	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}

static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
					     unsigned long addr,
					     pte_t *ptep, pte_t pte)
{
	/*
	 * The pte is non-present, so there's no hardware state to
	 * preserve.
	 */
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
					   unsigned long addr,
					   pte_t *ptep)
{
	return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
					   unsigned long addr,
					   pte_t *ptep, pte_t old_pte, pte_t pte)
{
	__ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 */

#ifndef pgprot_nx
#define pgprot_nx(prot)	(prot)
#endif

#ifndef pgprot_noncached
#define pgprot_noncached(prot)	(prot)
#endif

#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

#ifndef pgprot_mhp
#define pgprot_mhp(prot)	(prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
	if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
		newprot = pgprot_noncached(newprot);
	if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
		newprot = pgprot_writecombine(newprot);
	if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
		newprot = pgprot_device(newprot);
	return newprot;
}
#endif
#endif /* CONFIG_MMU */

#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)	(prot)
#endif

#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)	(prot)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
#define arch_start_context_switch(prev)	do {} while (0)
#endif

#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
	return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
	return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
static inline int pte_soft_dirty(pte_t pte)
{
	return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
	return 0;
}

static inline pte_t pte_mksoft_dirty(pte_t pte)
{
	return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
	return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
	return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
	return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
	return pte;
}

static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
	return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
	return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
	return pmd;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 */

/*
 * track_pfn_remap is called when a _new_ pfn mapping is being established
 * by remap_pfn_range() for physical range indicated by pfn and size.
 */
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
				  unsigned long pfn, unsigned long addr,
				  unsigned long size)
{
	return 0;
}

/*
 * track_pfn_insert is called when a _new_ single pfn is established
 * by vmf_insert_pfn().
 */
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
				    pfn_t pfn)
{
}

/*
 * track_pfn_copy is called when vma that is covering the pfnmap gets
 * copied through copy_page_range().
 */
static inline int track_pfn_copy(struct vm_area_struct *vma)
{
	return 0;
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
static inline void untrack_pfn(struct vm_area_struct *vma,
			       unsigned long pfn, unsigned long size,
			       bool mm_wr_locked)
{
}

/*
 * untrack_pfn_clear is called while mremapping a pfnmap for a new region
 * or fails to copy pgtable during duplicate vm area.
 */
static inline void untrack_pfn_clear(struct vm_area_struct *vma)
{
}
#else
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
			   unsigned long pfn, unsigned long addr,
			   unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
			     pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
			unsigned long size, bool mm_wr_locked);
extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif

#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
	extern unsigned long zero_pfn;
	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
}

#define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))

#else
static inline int is_zero_pfn(unsigned long pfn)
{
	extern unsigned long zero_pfn;
	return pfn == zero_pfn;
}

static inline unsigned long my_zero_pfn(unsigned long addr)
{
	extern unsigned long zero_pfn;
	return zero_pfn;
}
#endif
#else
static inline int is_zero_pfn(unsigned long pfn)
{
	return 0;
}

static inline unsigned long my_zero_pfn(unsigned long addr)
{
	return 0;
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
	return 0;
}
#ifndef pmd_write
static inline int pmd_write(pmd_t pmd)
{
	BUG();
	return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
static inline int pud_write(pud_t pud)
{
	BUG();
	return 0;
}
#endif /* pud_write */

#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline int pmd_devmap(pmd_t pmd)
{
	return 0;
}
static inline int pud_devmap(pud_t pud)
{
	return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
	return 0;
}
#endif

#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
	!defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
static inline int pud_trans_huge(pud_t pud)
{
	return 0;
}
#endif

static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	pud_t pudval = READ_ONCE(*pud);

	if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
		return 1;
	if (unlikely(pud_bad(pudval))) {
		pud_clear_bad(pud);
		return 1;
	}
#endif
	return 0;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
 * perfectly valid to indicate "no" in that case, which is why our default
 * implementation defaults to "always no".
 *
 * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
 * page protection due to NUMA hinting. NUMA hinting faults only apply in
 * accessible VMAs.
 *
 * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
 * looking at the VMA accessibility is sufficient.
 */
static inline int pte_protnone(pte_t pte)
{
	return 0;
}

static inline int pmd_protnone(pmd_t pmd)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
void p4d_clear_huge(p4d_t *p4d);
#else
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
#endif /* !__PAGETABLE_P4D_FOLDED */

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
static inline int pud_clear_huge(pud_t *pud)
{
	return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
	return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
	return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return 0;
}
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Even otherwise, it can help optimize the normal TLB flush
 * in the THP regime. Stock flush_tlb_range() typically has an optimization to
 * nuke the entire TLB if the flush span is greater than a threshold, which
 * will likely be true for a single huge page. Thus a single THP flush will
 * invalidate the entire TLB, which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
#else
#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
#endif
#endif

struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
			unsigned long size, pgprot_t *vma_prot);

#ifndef CONFIG_X86_ESPFIX64
static inline void init_espfix_bsp(void) { }
#endif

extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
	return true;
}

static inline bool arch_has_pfn_modify_check(void)
{
	return false;
}
#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() set of functions and in the generic
 * vmalloc/ioremap code to track at which page-table levels entries have been
 * modified. Based on that the code can better decide when vmalloc and ioremap
 * mapping changes need to be synchronized to other page-tables in the system.
 */
#define		__PGTBL_PGD_MODIFIED	0
#define		__PGTBL_P4D_MODIFIED	1
#define		__PGTBL_PUD_MODIFIED	2
#define		__PGTBL_PMD_MODIFIED	3
#define		__PGTBL_PTE_MODIFIED	4

#define		PGTBL_PGD_MODIFIED	BIT(__PGTBL_PGD_MODIFIED)
#define		PGTBL_P4D_MODIFIED	BIT(__PGTBL_P4D_MODIFIED)
#define		PGTBL_PUD_MODIFIED	BIT(__PGTBL_PUD_MODIFIED)
#define		PGTBL_PMD_MODIFIED	BIT(__PGTBL_PMD_MODIFIED)
#define		PGTBL_PTE_MODIFIED	BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

#endif /* !__ASSEMBLY__ */

#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
#ifdef CONFIG_PHYS_ADDR_T_64BIT
/*
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
 * with physical address space extension, but falls back to
 * BITS_PER_LONG otherwise.
 */
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif

#ifndef has_transparent_hugepage
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif

#ifndef has_transparent_pud_hugepage
#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
#endif

/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)	__is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)	__is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
#endif

#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * p?d_leaf() - true if this entry is a final mapping to a physical address.
 * This differs from p?d_huge() by the fact that they are always available (if
 * the architecture supports large pages at the appropriate level) even
 * if CONFIG_HUGETLB_PAGE is not defined.
 * Only meaningful when called on a valid entry.
 */
#ifndef pgd_leaf
#define pgd_leaf(x)	0
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)	0
#endif
#ifndef pud_leaf
#define pud_leaf(x)	0
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)	0
#endif

#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif

/*
 * Some architectures have MMUs that are configurable or selectable at boot
 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
 * helps to have a static maximum value.
 */

#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif

#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif

#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif

#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE (with Enhanced PAN supported):
 *								r: (no) no
 *								w: (no) no
 *								x: (yes) yes
 */
#define DECLARE_VM_GET_PAGE_PROT					\
pgprot_t vm_get_page_prot(unsigned long vm_flags)			\
{									\
		return protection_map[vm_flags &			\
			(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];	\
}									\
EXPORT_SYMBOL(vm_get_page_prot);

#endif /* _LINUX_PGTABLE_H */
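
/*
 * Illustrative sketch (not part of the original header): how the
 * p?d_addr_end() and p?d_none_or_clear_bad() helpers above combine in a
 * typical top-level page-table walk. walk_pgd_range_example() and its body
 * are hypothetical; real walkers of this shape live in mm/ (e.g. pagewalk).
 */
#if 0	/* example only, never compiled */
static void walk_pgd_range_example(struct mm_struct *mm,
				   unsigned long addr, unsigned long end)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	unsigned long next;

	do {
		/* Clamp this iteration to the PGD entry covering addr. */
		next = pgd_addr_end(addr, end);
		/* Skip empty entries; report and reset corrupted ones. */
		if (pgd_none_or_clear_bad(pgd))
			continue;
		/* ... descend to the p4d level for [addr, next) here ... */
	} while (pgd++, addr = next, addr != end);
}
#endif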
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file.  Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* K: work being processed and its */
	work_func_t		current_func;	/* K: function */
	struct pool_workqueue	*current_pwq;	/* K: pwq */
	u64			current_at;	/* K: runtime at start or last wakeup */
	unsigned int		current_color;	/* K: color */

	int			sleeping;	/* S: is worker sleeping? */

	/* used by the scheduler to determine a worker's last known identity */
	work_func_t		last_func;	/* K: last work's fn */

	struct list_head	scheduled;	/* L: scheduled works */

	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* A: the associated pool */
						/* L: for rescuers */
	struct list_head	node;		/* A: anchored at pool->workers */
						/* A: runs through worker->node */

	unsigned long		last_active;	/* K: last active timestamp */
	unsigned int		flags;		/* L: flags */
	int			id;		/* I: worker id */

	/*
	 * Opaque string set with work_set_desc().  Printed out with task
	 * dump for debugging - WARN, BUG, panic or sysrq.
	 */
	char			desc[WORKER_DESC_LEN];

	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 */
static inline struct worker *current_wq_worker(void)
{
	if (in_task() && (current->flags & PF_WQ_WORKER))
		return kthread_data(current);
	return NULL;
}

/*
 * Scheduler hooks for concurrency managed workqueue.  Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
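
/*
 * Illustrative sketch (not part of the original header): the scheduler side
 * is the intended caller of the hooks above. Before a worker task blocks,
 * something like the following runs so the pool can wake another worker;
 * example_sched_submit_work() is a hypothetical stand-in for the logic in
 * kernel/sched/core.c.
 */
#if 0	/* example only, never compiled */
static void example_sched_submit_work(struct task_struct *tsk)
{
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);	/* may kick an idle worker */
}
#endif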
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/filesystems.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  table of configured filesystems
 */

#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs_parser.h>

/*
 * Handling of filesystem drivers list.
 * Rules:
 *	Inclusion to/removals from/scanning of list are protected by spinlock.
 *	During unload, the module must call unregister_filesystem().
 *	We can access the fields of list element if:
 *		1) spinlock is held or
 *		2) we hold the reference to the module.
 *	The latter can be guaranteed by call of try_module_get(); if it
 *	returned 0 we must skip the element, otherwise we got the reference.
 *	Once the reference is obtained we can drop the spinlock.
 */

static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);

/* WARNING: This can be used only if we _already_ own a reference */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
	__module_get(fs->owner);
	return fs;
}

void put_filesystem(struct file_system_type *fs)
{
	module_put(fs->owner);
}

static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
	struct file_system_type **p;
	for (p = &file_systems; *p; p = &(*p)->next)
		if (strncmp((*p)->name, name, len) == 0 &&
		    !(*p)->name[len])
			break;
	return p;
}

/**
 *	register_filesystem - register a new filesystem
 *	@fs: the file system structure
 *
 *	Adds the file system passed to the list of file systems the kernel
 *	is aware of for mount and other syscalls. Returns 0 on success,
 *	or a negative errno code on an error.
 *
 *	The &struct file_system_type that is passed is linked into the kernel
 *	structures and must not be freed until the file system has been
 *	unregistered.
 */

int register_filesystem(struct file_system_type *fs)
{
	int res = 0;
	struct file_system_type **p;

	if (fs->parameters &&
	    !fs_validate_description(fs->name, fs->parameters))
		return -EINVAL;

	BUG_ON(strchr(fs->name, '.'));
	if (fs->next)
		return -EBUSY;
	write_lock(&file_systems_lock);
	p = find_filesystem(fs->name, strlen(fs->name));
	if (*p)
		res = -EBUSY;
	else
		*p = fs;
	write_unlock(&file_systems_lock);
	return res;
}

EXPORT_SYMBOL(register_filesystem);

/**
 *	unregister_filesystem - unregister a file system
 *	@fs: filesystem to unregister
 *
 *	Remove a file system that was previously successfully registered
 *	with the kernel. An error is returned if the file system is not found.
 *	Zero is returned on a success.
 *
 *	Once this function has returned the &struct file_system_type structure
 *	may be freed or reused.
 */

int unregister_filesystem(struct file_system_type *fs)
{
	struct file_system_type **tmp;

	write_lock(&file_systems_lock);
	tmp = &file_systems;
	while (*tmp) {
		if (fs == *tmp) {
			*tmp = fs->next;
			fs->next = NULL;
			write_unlock(&file_systems_lock);
			synchronize_rcu();
			return 0;
		}
		tmp = &(*tmp)->next;
	}
	write_unlock(&file_systems_lock);

	return -EINVAL;
}

EXPORT_SYMBOL(unregister_filesystem);

#ifdef CONFIG_SYSFS_SYSCALL
static int fs_index(const char __user *__name)
{
	struct file_system_type *tmp;
	struct filename *name;
	int err, index;

	name = getname(__name);
	err = PTR_ERR(name);
	if (IS_ERR(name))
		return err;

	err = -EINVAL;
	read_lock(&file_systems_lock);
	for (tmp = file_systems, index = 0; tmp; tmp = tmp->next, index++) {
		if (strcmp(tmp->name, name->name) == 0) {
			err = index;
			break;
		}
	}
	read_unlock(&file_systems_lock);
	putname(name);
	return err;
}

static int fs_name(unsigned int index, char __user *buf)
{
	struct file_system_type *tmp;
	int len, res;

	read_lock(&file_systems_lock);
	for (tmp = file_systems; tmp; tmp = tmp->next, index--)
		if (index <= 0 && try_module_get(tmp->owner))
			break;
	read_unlock(&file_systems_lock);
	if (!tmp)
		return -EINVAL;

	/* OK, we got the reference, so we can safely block */
	len = strlen(tmp->name) + 1;
	res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0;
	put_filesystem(tmp);
	return res;
}

static int fs_maxindex(void)
{
	struct file_system_type *tmp;
	int index;

	read_lock(&file_systems_lock);
	for (tmp = file_systems, index = 0; tmp; tmp = tmp->next, index++)
		;
	read_unlock(&file_systems_lock);
	return index;
}

/*
 * Whee.. Weird sysv syscall.
 */
SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
{
	int retval = -EINVAL;

	switch (option) {
	case 1:
		retval = fs_index((const char __user *) arg1);
		break;

	case 2:
		retval = fs_name(arg1, (char __user *) arg2);
		break;

	case 3:
		retval = fs_maxindex();
		break;
	}
	return retval;
}
#endif

int __init list_bdev_fs_names(char *buf, size_t size)
{
	struct file_system_type *p;
	size_t len;
	int count = 0;

	read_lock(&file_systems_lock);
	for (p = file_systems; p; p = p->next) {
		if (!(p->fs_flags & FS_REQUIRES_DEV))
			continue;
		len = strlen(p->name) + 1;
		if (len > size) {
			pr_warn("%s: truncating file system list\n", __func__);
			break;
		}
		memcpy(buf, p->name, len);
		buf += len;
		size -= len;
		count++;
	}
	read_unlock(&file_systems_lock);
	return count;
}

#ifdef CONFIG_PROC_FS
static int filesystems_proc_show(struct seq_file *m, void *v)
{
	struct file_system_type *tmp;

	read_lock(&file_systems_lock);
	tmp = file_systems;
	while (tmp) {
		seq_printf(m, "%s\t%s\n",
			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
			tmp->name);
		tmp = tmp->next;
	}
	read_unlock(&file_systems_lock);
	return 0;
}

static int __init proc_filesystems_init(void)
{
	proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
	return 0;
}
module_init(proc_filesystems_init);
#endif

static struct file_system_type *__get_fs_type(const char *name, int len)
{
	struct file_system_type *fs;

	read_lock(&file_systems_lock);
	fs = *(find_filesystem(name, len));
	if (fs && !try_module_get(fs->owner))
		fs = NULL;
	read_unlock(&file_systems_lock);
	return fs;
}

struct file_system_type *get_fs_type(const char *name)
{
	struct file_system_type *fs;
	const char *dot = strchr(name, '.');
	int len = dot ? dot - name : strlen(name);

	fs = __get_fs_type(name, len);
	if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
		fs = __get_fs_type(name, len);
		if (!fs)
			pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
				     len, name);
	}

	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
		put_filesystem(fs);
		fs = NULL;
	}
	return fs;
}

EXPORT_SYMBOL(get_fs_type);
// SPDX-License-Identifier: GPL-2.0-only
/*
 *	scsi_pm.c	Copyright (C) 2010 Alan Stern
 *
 *	SCSI dynamic Power Management
 *		Initial version: Alan Stern <stern@rowland.harvard.edu>
 */

#include <linux/pm_runtime.h>
#include <linux/export.h>
#include <linux/blk-pm.h>

#include <scsi/scsi.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_driver.h>
#include <scsi/scsi_host.h>

#include "scsi_priv.h"

#ifdef CONFIG_PM_SLEEP

static int do_scsi_suspend(struct device *dev, const struct dev_pm_ops *pm)
{
	return pm && pm->suspend ? pm->suspend(dev) : 0;
}

static int do_scsi_freeze(struct device *dev, const struct dev_pm_ops *pm)
{
	return pm && pm->freeze ? pm->freeze(dev) : 0;
}

static int do_scsi_poweroff(struct device *dev, const struct dev_pm_ops *pm)
{
	return pm && pm->poweroff ? pm->poweroff(dev) : 0;
}

static int do_scsi_resume(struct device *dev, const struct dev_pm_ops *pm)
{
	return pm && pm->resume ? pm->resume(dev) : 0;
}

static int do_scsi_thaw(struct device *dev, const struct dev_pm_ops *pm)
{
	return pm && pm->thaw ? pm->thaw(dev) : 0;
}

static int do_scsi_restore(struct device *dev, const struct dev_pm_ops *pm)
{
	return pm && pm->restore ? pm->restore(dev) : 0;
}

static int scsi_dev_type_suspend(struct device *dev,
		int (*cb)(struct device *, const struct dev_pm_ops *))
{
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	int err;

	err = scsi_device_quiesce(to_scsi_device(dev));
	if (err == 0) {
		err = cb(dev, pm);
		if (err)
			scsi_device_resume(to_scsi_device(dev));
	}
	dev_dbg(dev, "scsi suspend: %d\n", err);
	return err;
}

static int scsi_bus_suspend_common(struct device *dev,
		int (*cb)(struct device *, const struct dev_pm_ops *))
{
	if (!scsi_is_sdev_device(dev))
		return 0;

	return scsi_dev_type_suspend(dev, cb);
}

static int scsi_bus_resume_common(struct device *dev,
		int (*cb)(struct device *, const struct dev_pm_ops *))
{
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	int err;

	if (!scsi_is_sdev_device(dev))
		return 0;

	err = cb(dev, pm);
	scsi_device_resume(to_scsi_device(dev));
	dev_dbg(dev, "scsi resume: %d\n", err);

	return err;
}

static int scsi_bus_prepare(struct device *dev)
{
	if (scsi_is_host_device(dev)) {
		/* Wait until async scanning is finished */
		scsi_complete_async_scans();
	}
	return 0;
}

static int scsi_bus_suspend(struct device *dev)
{
	return scsi_bus_suspend_common(dev, do_scsi_suspend);
}

static int scsi_bus_resume(struct device *dev)
{
	return scsi_bus_resume_common(dev, do_scsi_resume);
}

static int scsi_bus_freeze(struct device *dev)
{
	return scsi_bus_suspend_common(dev, do_scsi_freeze);
}

static int scsi_bus_thaw(struct device *dev)
{
	return scsi_bus_resume_common(dev, do_scsi_thaw);
}

static int scsi_bus_poweroff(struct device *dev)
{
	return scsi_bus_suspend_common(dev, do_scsi_poweroff);
}

static int scsi_bus_restore(struct device *dev)
{
	return scsi_bus_resume_common(dev, do_scsi_restore);
}

#else /* CONFIG_PM_SLEEP */

#define scsi_bus_prepare		NULL
#define scsi_bus_suspend		NULL
#define scsi_bus_resume			NULL
#define scsi_bus_freeze			NULL
#define scsi_bus_thaw			NULL
#define scsi_bus_poweroff		NULL
#define scsi_bus_restore		NULL

#endif /* CONFIG_PM_SLEEP */

static int sdev_runtime_suspend(struct device *dev)
{
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	struct scsi_device *sdev = to_scsi_device(dev);
	int err = 0;

	err = blk_pre_runtime_suspend(sdev->request_queue);
	if (err)
		return err;
	if (pm && pm->runtime_suspend)
		err = pm->runtime_suspend(dev);
	blk_post_runtime_suspend(sdev->request_queue, err);

	return err;
}

static int scsi_runtime_suspend(struct device *dev)
{
	int err = 0;

	dev_dbg(dev, "scsi_runtime_suspend\n");
	if (scsi_is_sdev_device(dev))
		err = sdev_runtime_suspend(dev);

	/* Insert hooks here for targets, hosts, and transport classes */

	return err;
}

static int sdev_runtime_resume(struct device *dev)
{
	struct scsi_device *sdev = to_scsi_device(dev);
	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
	int err = 0;

	blk_pre_runtime_resume(sdev->request_queue);
	if (pm && pm->runtime_resume)
		err = pm->runtime_resume(dev);
	blk_post_runtime_resume(sdev->request_queue);

	return err;
}

static int scsi_runtime_resume(struct device *dev)
{
	int err = 0;

	dev_dbg(dev, "scsi_runtime_resume\n");
	if (scsi_is_sdev_device(dev))
		err = sdev_runtime_resume(dev);

	/* Insert hooks here for targets, hosts, and transport classes */

	return err;
}

static int scsi_runtime_idle(struct device *dev)
{
	dev_dbg(dev, "scsi_runtime_idle\n");

	/* Insert hooks here for targets, hosts, and transport classes */

	if (scsi_is_sdev_device(dev)) {
		pm_runtime_mark_last_busy(dev);
		pm_runtime_autosuspend(dev);
		return -EBUSY;
	}

	return 0;
}

int scsi_autopm_get_device(struct scsi_device *sdev)
{
	int	err;

	err = pm_runtime_get_sync(&sdev->sdev_gendev);
	if (err < 0 && err != -EACCES)
		pm_runtime_put_sync(&sdev->sdev_gendev);
	else
		err = 0;
	return err;
}
EXPORT_SYMBOL_GPL(scsi_autopm_get_device);

void scsi_autopm_put_device(struct scsi_device *sdev)
{
	pm_runtime_put_sync(&sdev->sdev_gendev);
}
EXPORT_SYMBOL_GPL(scsi_autopm_put_device);

void scsi_autopm_get_target(struct scsi_target *starget)
{
	pm_runtime_get_sync(&starget->dev);
}

void scsi_autopm_put_target(struct scsi_target *starget)
{
	pm_runtime_put_sync(&starget->dev);
}

int scsi_autopm_get_host(struct Scsi_Host *shost)
{
	int	err;

	err = pm_runtime_get_sync(&shost->shost_gendev);
	if (err < 0 && err != -EACCES)
		pm_runtime_put_sync(&shost->shost_gendev);
	else
		err = 0;
	return err;
}

void scsi_autopm_put_host(struct Scsi_Host *shost)
{
	pm_runtime_put_sync(&shost->shost_gendev);
}

const struct dev_pm_ops scsi_bus_pm_ops = {
	.prepare =		scsi_bus_prepare,
	.suspend =		scsi_bus_suspend,
	.resume =		scsi_bus_resume,
	.freeze =		scsi_bus_freeze,
	.thaw =			scsi_bus_thaw,
	.poweroff =		scsi_bus_poweroff,
	.restore =		scsi_bus_restore,
	.runtime_suspend =	scsi_runtime_suspend,
	.runtime_resume =	scsi_runtime_resume,
	.runtime_idle =		scsi_runtime_idle,
};
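
/*
 * Illustrative sketch (not part of the original file): a caller is expected
 * to bracket ad-hoc device access with scsi_autopm_get_device() and
 * scsi_autopm_put_device() so the LUN cannot runtime-suspend underneath it.
 * do_something_with() is a hypothetical placeholder.
 */
#if 0	/* example only, never compiled */
static int example_access(struct scsi_device *sdev)
{
	int err = scsi_autopm_get_device(sdev);	/* resumes if suspended */

	if (err)
		return err;
	err = do_something_with(sdev);
	scsi_autopm_put_device(sdev);		/* may autosuspend later */
	return err;
}
#endif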
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * 'Generic' ticket-lock implementation.
 *
 * It relies on atomic_fetch_add() having well defined forward progress
 * guarantees under contention. If your architecture cannot provide this, stick
 * to a test-and-set lock.
 *
 * It also relies on atomic_fetch_add() being safe vs smp_store_release() on a
 * sub-word of the value. This is generally true for anything LL/SC although
 * you'd be hard pressed to find anything useful in architecture specifications
 * about this. If your architecture cannot do this you might be better off with
 * a test-and-set.
 *
 * It further assumes atomic_*_release() + atomic_*_acquire() is RCpc and hence
 * uses atomic_fetch_add() which is RCsc to create an RCsc hot path, along with
 * a full fence after the spin to upgrade the otherwise-RCpc
 * atomic_cond_read_acquire().
 *
 * The implementation uses smp_cond_load_acquire() to spin, so if the
 * architecture has WFE like instructions to sleep instead of poll for word
 * modifications be sure to implement that (see ARM64 for example).
 */

#ifndef __ASM_GENERIC_SPINLOCK_H
#define __ASM_GENERIC_SPINLOCK_H

#include <linux/atomic.h>
#include <asm-generic/spinlock_types.h>

static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
	u32 val = atomic_fetch_add(1<<16, lock);
	u16 ticket = val >> 16;

	if (ticket == (u16)val)
		return;

	/*
	 * atomic_cond_read_acquire() is RCpc, but rather than defining a
	 * custom cond_read_rcsc() here we just emit a full fence.  We only
	 * need the prior reads before subsequent writes ordering from
	 * smp_mb(), but as atomic_cond_read_acquire() just emits reads and we
	 * have no outstanding writes due to the atomic_fetch_add() the extra
	 * orderings are free.
	 */
	atomic_cond_read_acquire(lock, ticket == (u16)VAL);
	smp_mb();
}

static __always_inline bool arch_spin_trylock(arch_spinlock_t *lock)
{
	u32 old = atomic_read(lock);

	if ((old >> 16) != (old & 0xffff))
		return false;

	return atomic_try_cmpxchg(lock, &old, old + (1<<16)); /* SC, for RCsc */
}

static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
{
	u16 *ptr = (u16 *)lock + IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
	u32 val = atomic_read(lock);

	smp_store_release(ptr, (u16)val + 1);
}

static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
	u32 val = lock.counter;

	return ((val >> 16) == (val & 0xffff));
}

static __always_inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
	arch_spinlock_t val = READ_ONCE(*lock);

	return !arch_spin_value_unlocked(val);
}

static __always_inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
	u32 val = atomic_read(lock);

	return (s16)((val >> 16) - (val & 0xffff)) > 1;
}

#include <asm/qrwlock.h>

#endif /* __ASM_GENERIC_SPINLOCK_H */
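
/*
 * Worked example (editorial addition, not part of the original header):
 * the 32-bit lock word packs "next" in the high 16 bits and "owner" in the
 * low 16 bits. arch_spin_lock() takes ticket 'next' via the fetch-add and
 * waits until 'owner' catches up; arch_spin_unlock() bumps 'owner' alone
 * through the sub-word store. Starting from 0x00020002 (next == owner == 2,
 * unlocked), a lock moves the word to 0x00030002 (ticket 2 granted
 * immediately, since ticket == (u16)val), and the matching unlock returns
 * it to 0x00030003. A second locker arriving in between would hold ticket 3
 * and spin until that unlock makes owner == 3.
 */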
// SPDX-License-Identifier: GPL-2.0
/*
 * udc.c - Core UDC Framework
 *
 * Copyright (C) 2016 Intel Corporation
 * Author: Felipe Balbi <felipe.balbi@linux.intel.com>
 */

#undef TRACE_SYSTEM
#define TRACE_SYSTEM gadget

#if !defined(__UDC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __UDC_TRACE_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <asm/byteorder.h>
#include <linux/usb/gadget.h>

DECLARE_EVENT_CLASS(udc_log_gadget,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret),
	TP_STRUCT__entry(
		__field(enum usb_device_speed, speed)
		__field(enum usb_device_speed, max_speed)
		__field(enum usb_device_state, state)
		__field(unsigned, mA)
		__field(unsigned, sg_supported)
		__field(unsigned, is_otg)
		__field(unsigned, is_a_peripheral)
		__field(unsigned, b_hnp_enable)
		__field(unsigned, a_hnp_support)
		__field(unsigned, hnp_polling_support)
		__field(unsigned, host_request_flag)
		__field(unsigned, quirk_ep_out_aligned_size)
		__field(unsigned, quirk_altset_not_supp)
		__field(unsigned, quirk_stall_not_supp)
		__field(unsigned, quirk_zlp_not_supp)
		__field(unsigned, is_selfpowered)
		__field(unsigned, deactivated)
		__field(unsigned, connected)
		__field(int, ret)
	),
	TP_fast_assign(
		__entry->speed = g->speed;
		__entry->max_speed = g->max_speed;
		__entry->state = g->state;
		__entry->mA = g->mA;
		__entry->sg_supported = g->sg_supported;
		__entry->is_otg = g->is_otg;
		__entry->is_a_peripheral = g->is_a_peripheral;
		__entry->b_hnp_enable = g->b_hnp_enable;
		__entry->a_hnp_support = g->a_hnp_support;
		__entry->hnp_polling_support = g->hnp_polling_support;
		__entry->host_request_flag = g->host_request_flag;
		__entry->quirk_ep_out_aligned_size = g->quirk_ep_out_aligned_size;
		__entry->quirk_altset_not_supp = g->quirk_altset_not_supp;
		__entry->quirk_stall_not_supp = g->quirk_stall_not_supp;
		__entry->quirk_zlp_not_supp = g->quirk_zlp_not_supp;
		__entry->is_selfpowered = g->is_selfpowered;
		__entry->deactivated = g->deactivated;
		__entry->connected = g->connected;
		__entry->ret = ret;
	),
	TP_printk("speed %d/%d state %d %dmA [%s%s%s%s%s%s%s%s%s%s%s%s%s%s] --> %d",
		__entry->speed, __entry->max_speed, __entry->state,
		__entry->mA, __entry->sg_supported ? "sg:" : "",
		__entry->is_otg ? "OTG:" : "",
		__entry->is_a_peripheral ? "a_peripheral:" : "",
		__entry->b_hnp_enable ? "b_hnp:" : "",
		__entry->a_hnp_support ? "a_hnp:" : "",
		__entry->hnp_polling_support ? "hnp_poll:" : "",
		__entry->host_request_flag ? "hostreq:" : "",
		__entry->quirk_ep_out_aligned_size ? "out_aligned:" : "",
		__entry->quirk_altset_not_supp ? "no_altset:" : "",
		__entry->quirk_stall_not_supp ? "no_stall:" : "",
		__entry->quirk_zlp_not_supp ? "no_zlp" : "",
		__entry->is_selfpowered ? "self-powered:" : "bus-powered:",
		__entry->deactivated ? "deactivated:" : "activated:",
		__entry->connected ? "connected" : "disconnected",
		__entry->ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_frame_number,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_wakeup,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_set_remote_wakeup,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_set_selfpowered,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_clear_selfpowered,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_vbus_connect,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_vbus_draw,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_vbus_disconnect,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_connect,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_disconnect,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_deactivate,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DEFINE_EVENT(udc_log_gadget, usb_gadget_activate,
	TP_PROTO(struct usb_gadget *g, int ret),
	TP_ARGS(g, ret)
);

DECLARE_EVENT_CLASS(udc_log_ep,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret),
	TP_STRUCT__entry(
		__string(name, ep->name)
		__field(unsigned, maxpacket)
		__field(unsigned, maxpacket_limit)
		__field(unsigned, max_streams)
		__field(unsigned, mult)
		__field(unsigned, maxburst)
		__field(u8, address)
		__field(bool, claimed)
		__field(bool, enabled)
		__field(int, ret)
	),
	TP_fast_assign(
		__assign_str(name, ep->name);
		__entry->maxpacket = ep->maxpacket;
		__entry->maxpacket_limit = ep->maxpacket_limit;
		__entry->max_streams = ep->max_streams;
		__entry->mult = ep->mult;
		__entry->maxburst = ep->maxburst;
		__entry->address = ep->address;
		__entry->claimed = ep->claimed;
		__entry->enabled = ep->enabled;
		__entry->ret = ret;
	),
	TP_printk("%s: mps %d/%d streams %d mult %d burst %d addr %02x %s%s --> %d",
		__get_str(name), __entry->maxpacket, __entry->maxpacket_limit,
		__entry->max_streams, __entry->mult, __entry->maxburst,
		__entry->address, __entry->claimed ? "claimed:" : "released:",
		__entry->enabled ? "enabled" : "disabled", __entry->ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_set_maxpacket_limit,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_enable,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_disable,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_set_halt,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_clear_halt,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_set_wedge,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_fifo_status,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DEFINE_EVENT(udc_log_ep, usb_ep_fifo_flush,
	TP_PROTO(struct usb_ep *ep, int ret),
	TP_ARGS(ep, ret)
);

DECLARE_EVENT_CLASS(udc_log_req,
	TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret),
	TP_ARGS(ep, req, ret),
	TP_STRUCT__entry(
		__string(name, ep->name)
		__field(unsigned, length)
		__field(unsigned, actual)
		__field(unsigned, num_sgs)
		__field(unsigned, num_mapped_sgs)
		__field(unsigned, stream_id)
		__field(unsigned, no_interrupt)
		__field(unsigned, zero)
		__field(unsigned, short_not_ok)
		__field(int, status)
		__field(int, ret)
		__field(struct usb_request *, req)
	),
	TP_fast_assign(
		__assign_str(name, ep->name);
		__entry->length = req->length;
		__entry->actual = req->actual;
		__entry->num_sgs = req->num_sgs;
		__entry->num_mapped_sgs = req->num_mapped_sgs;
		__entry->stream_id = req->stream_id;
		__entry->no_interrupt = req->no_interrupt;
		__entry->zero = req->zero;
		__entry->short_not_ok = req->short_not_ok;
		__entry->status = req->status;
		__entry->ret = ret;
		__entry->req = req;
	),
	TP_printk("%s: req %p length %d/%d sgs %d/%d stream %d %s%s%s status %d --> %d",
		__get_str(name), __entry->req, __entry->actual, __entry->length,
		__entry->num_mapped_sgs, __entry->num_sgs, __entry->stream_id,
		__entry->zero ? "Z" : "z",
		__entry->short_not_ok ? "S" : "s",
		__entry->no_interrupt ? "i" : "I",
		__entry->status, __entry->ret
	)
);

DEFINE_EVENT(udc_log_req, usb_ep_alloc_request,
	TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret),
	TP_ARGS(ep, req, ret)
);

DEFINE_EVENT(udc_log_req, usb_ep_free_request,
	TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret),
	TP_ARGS(ep, req, ret)
);

DEFINE_EVENT(udc_log_req, usb_ep_queue,
	TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret),
	TP_ARGS(ep, req, ret)
);

DEFINE_EVENT(udc_log_req, usb_ep_dequeue,
	TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret),
	TP_ARGS(ep, req, ret)
);

DEFINE_EVENT(udc_log_req, usb_gadget_giveback_request,
	TP_PROTO(struct usb_ep *ep, struct usb_request *req, int ret),
	TP_ARGS(ep, req, ret)
);

#endif /* __UDC_TRACE_H */

/* this part has to be here */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .

#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace

#include <trace/define_trace.h>
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
 *
 * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
 * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
 */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>
#include <net/ip.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("iptables mangle table");

#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
			    (1 << NF_INET_LOCAL_IN) | \
			    (1 << NF_INET_FORWARD) | \
			    (1 << NF_INET_LOCAL_OUT) | \
			    (1 << NF_INET_POST_ROUTING))

static const struct xt_table packet_mangler = {
	.name		= "mangle",
	.valid_hooks	= MANGLE_VALID_HOOKS,
	.me		= THIS_MODULE,
	.af		= NFPROTO_IPV4,
	.priority	= NF_IP_PRI_MANGLE,
};

static unsigned int
ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
	unsigned int ret, verdict;
	const struct iphdr *iph;
	__be32 saddr, daddr;
	u32 mark;
	int err;
	u8 tos;

	/* Save things which could affect route */
	mark = skb->mark;
	iph = ip_hdr(skb);
	saddr = iph->saddr;
	daddr = iph->daddr;
	tos = iph->tos;

	ret = ipt_do_table(priv, skb, state);
	verdict = ret & NF_VERDICT_MASK;
	/* Reroute for ANY change. */
	if (verdict != NF_DROP && verdict != NF_STOLEN) {
		iph = ip_hdr(skb);

		if (iph->saddr != saddr ||
		    iph->daddr != daddr ||
		    skb->mark != mark ||
		    iph->tos != tos) {
			err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
			if (err < 0)
				ret = NF_DROP_ERR(err);
		}
	}

	return ret;
}

/* The work comes in here from netfilter.c. */
static unsigned int
iptable_mangle_hook(void *priv,
		    struct sk_buff *skb,
		    const struct nf_hook_state *state)
{
	if (state->hook == NF_INET_LOCAL_OUT)
		return ipt_mangle_out(priv, skb, state);
	return ipt_do_table(priv, skb, state);
}

static struct nf_hook_ops *mangle_ops __read_mostly;
static int iptable_mangle_table_init(struct net *net)
{
	struct ipt_replace *repl;
	int ret;

	repl = ipt_alloc_initial_table(&packet_mangler);
	if (repl == NULL)
		return -ENOMEM;
	ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops);
	kfree(repl);
	return ret;
}

static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
{
	ipt_unregister_table_pre_exit(net, "mangle");
}

static void __net_exit iptable_mangle_net_exit(struct net *net)
{
	ipt_unregister_table_exit(net, "mangle");
}

static struct pernet_operations iptable_mangle_net_ops = {
	.pre_exit = iptable_mangle_net_pre_exit,
	.exit = iptable_mangle_net_exit,
};

static int __init iptable_mangle_init(void)
{
	int ret = xt_register_template(&packet_mangler,
				       iptable_mangle_table_init);
	if (ret < 0)
		return ret;

	mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
	if (IS_ERR(mangle_ops)) {
		xt_unregister_template(&packet_mangler);
		ret = PTR_ERR(mangle_ops);
		return ret;
	}

	ret = register_pernet_subsys(&iptable_mangle_net_ops);
	if (ret < 0) {
		xt_unregister_template(&packet_mangler);
		kfree(mangle_ops);
		return ret;
	}

	return ret;
}

static void __exit iptable_mangle_fini(void)
{
	unregister_pernet_subsys(&iptable_mangle_net_ops);
	xt_unregister_template(&packet_mangler);
	kfree(mangle_ops);
}

module_init(iptable_mangle_init);
module_exit(iptable_mangle_fini);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/core/dst_cache.c - dst entry cache
 *
 * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
 */

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <net/dst_cache.h>
#include <net/route.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_fib.h>
#endif
#include <uapi/linux/in.h>

struct dst_cache_pcpu {
	unsigned long refresh_ts;
	struct dst_entry *dst;
	u32 cookie;
	union {
		struct in_addr in_saddr;
		struct in6_addr in6_saddr;
	};
};

static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
				      struct dst_entry *dst, u32 cookie)
{
	dst_release(dst_cache->dst);
	if (dst)
		dst_hold(dst);

	dst_cache->cookie = cookie;
	dst_cache->dst = dst;
}

static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
					       struct dst_cache_pcpu *idst)
{
	struct dst_entry *dst;

	dst = idst->dst;
	if (!dst)
		goto fail;

	/* the cache already holds a dst reference; it can't go away */
	dst_hold(dst);

	if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
		dst_cache_per_cpu_dst_set(idst, NULL, 0);
		dst_release(dst);
		goto fail;
	}
	return dst;

fail:
	idst->refresh_ts = jiffies;
	return NULL;
}

struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
{
	if (!dst_cache->cache)
		return NULL;

	return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
}
EXPORT_SYMBOL_GPL(dst_cache_get);

struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
{
	struct dst_cache_pcpu *idst;
	struct dst_entry *dst;

	if (!dst_cache->cache)
		return NULL;

	idst = this_cpu_ptr(dst_cache->cache);
	dst = dst_cache_per_cpu_get(dst_cache, idst);
	if (!dst)
		return NULL;

	*saddr = idst->in_saddr.s_addr;
	return container_of(dst, struct rtable, dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);

void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
		       __be32 saddr)
{
	struct dst_cache_pcpu *idst;

	if (!dst_cache->cache)
		return;

	idst = this_cpu_ptr(dst_cache->cache);
	dst_cache_per_cpu_dst_set(idst, dst, 0);
	idst->in_saddr.s_addr = saddr;
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip4);

#if IS_ENABLED(CONFIG_IPV6)
void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
		       const struct in6_addr *saddr)
{
	struct dst_cache_pcpu *idst;

	if (!dst_cache->cache)
		return;

	idst = this_cpu_ptr(dst_cache->cache);
	dst_cache_per_cpu_dst_set(idst, dst,
				  rt6_get_cookie((struct rt6_info *)dst));
	idst->in6_saddr = *saddr;
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);

struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
				    struct in6_addr *saddr)
{
	struct dst_cache_pcpu *idst;
	struct dst_entry *dst;

	if (!dst_cache->cache)
		return NULL;

	idst = this_cpu_ptr(dst_cache->cache);
	dst = dst_cache_per_cpu_get(dst_cache, idst);
	if (!dst)
		return NULL;

	*saddr = idst->in6_saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
#endif

int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
{
	dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
					    gfp | __GFP_ZERO);
	if (!dst_cache->cache)
		return -ENOMEM;

	dst_cache_reset(dst_cache);
	return 0;
}
EXPORT_SYMBOL_GPL(dst_cache_init);

void dst_cache_destroy(struct dst_cache *dst_cache)
{
	int i;

	if (!dst_cache->cache)
		return;

	for_each_possible_cpu(i)
		dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);

	free_percpu(dst_cache->cache);
}
EXPORT_SYMBOL_GPL(dst_cache_destroy);

void dst_cache_reset_now(struct dst_cache *dst_cache)
{
	int i;

	if (!dst_cache->cache)
		return;

	dst_cache->reset_ts = jiffies;
	for_each_possible_cpu(i) {
		struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
		struct dst_entry *dst = idst->dst;

		idst->cookie = 0;
		idst->dst = NULL;
		dst_release(dst);
	}
}
EXPORT_SYMBOL_GPL(dst_cache_reset_now);
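
/*
 * Illustrative sketch (not part of the original file): typical tunnel-style
 * usage, caching a per-destination route lookup. my_tunnel_route() is a
 * hypothetical placeholder for an actual FIB lookup returning a struct
 * rtable * or an ERR_PTR().
 */
#if 0	/* example only, never compiled */
static struct rtable *example_get_route(struct dst_cache *cache,
					struct flowi4 *fl4)
{
	struct rtable *rt;

	rt = dst_cache_get_ip4(cache, &fl4->saddr);
	if (rt)
		return rt;			/* fast path: cached dst */

	rt = my_tunnel_route(fl4);		/* slow path: full lookup */
	if (!IS_ERR(rt))
		dst_cache_set_ip4(cache, &rt->dst, fl4->saddr);
	return rt;
}
#endif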
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints, longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
static inline u32 __hash_32_generic(u32 val)
{
	return val * GOLDEN_RATIO_32;
}

static inline u32 hash_32(u32 val, unsigned int bits)
{
	/* High bits are more random, so use them. */
	return __hash_32(val) >> (32 - bits);
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
	/* 64x64-bit multiply is efficient on all 64-bit processors */
	return val * GOLDEN_RATIO_64 >> (64 - bits);
#else
	/* Hash 64 bits using only 32x32-bit multiply. */
	return hash_32((u32)val ^ __hash_32(val >> 32), bits);
#endif
}

static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
	return hash_long((unsigned long)ptr, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
	unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
	val ^= (val >> 32);
#endif
	return (u32)val;
}

#endif /* _LINUX_HASH_H */
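
/*
 * Illustrative sketch (not part of the original header): hash_ptr()/hash_32()
 * are typically used to index a power-of-two bucket array; the 'bits'
 * argument selects how many of the high product bits survive, so the result
 * is always < 2^bits. EXAMPLE_HASH_BITS and example_table are hypothetical.
 */
#if 0	/* example only, never compiled */
#define EXAMPLE_HASH_BITS	8	/* 256 buckets */
static struct hlist_head example_table[1 << EXAMPLE_HASH_BITS];

static struct hlist_head *example_bucket(const void *key)
{
	return &example_table[hash_ptr(key, EXAMPLE_HASH_BITS)];
}
#endif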
| 235 243 10 168 165 168 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_INETDEVICE_H #define _LINUX_INETDEVICE_H #ifdef __KERNEL__ #include <linux/bitmap.h> #include <linux/if.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <linux/timer.h> #include <linux/sysctl.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; DECLARE_BITMAP(state, IPV4_DEVCONF_MAX); }; #define MC_HASH_SZ_LOG 9 struct in_device { struct net_device *dev; netdevice_tracker dev_tracker; refcount_t refcnt; int dead; struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ struct ip_mc_list __rcu *mc_list; /* IP multicast filter chain */ struct ip_mc_list __rcu * __rcu *mc_hash; int mc_count; /* Number of installed mcasts */ spinlock_t mc_tomb_lock; struct ip_mc_list *mc_tomb; unsigned long mr_v1_seen; unsigned long mr_v2_seen; unsigned long mr_maxdelay; unsigned long mr_qi; /* Query Interval */ unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ struct neigh_parms *arp_parms; struct ipv4_devconf cnf; struct rcu_head rcu_head; }; #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1]) #define IPV4_DEVCONF_ALL(net, attr) \ IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr) static inline int ipv4_devconf_get(struct in_device *in_dev, int index) { index--; return in_dev->cnf.data[index]; } static inline void ipv4_devconf_set(struct in_device *in_dev, int index, int val) { index--; set_bit(index, in_dev->cnf.state); in_dev->cnf.data[index] = val; } static inline void ipv4_devconf_setall(struct in_device *in_dev) { bitmap_fill(in_dev->cnf.state, IPV4_DEVCONF_MAX); } #define IN_DEV_CONF_GET(in_dev, attr) \ ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr) #define IN_DEV_CONF_SET(in_dev, attr, val) \ ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val)) #define IN_DEV_ANDCONF(in_dev, attr) \ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr) && \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_NET_ORCONF(in_dev, net, attr) \ (IPV4_DEVCONF_ALL(net, attr) || \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_ORCONF(in_dev, attr) \ IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr) #define 
IN_DEV_MAXCONF(in_dev, attr) \ (max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \ IN_DEV_CONF_GET((in_dev), attr))) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) #define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_ACCEPT_LOCAL(in_dev) IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL) #define IN_DEV_BOOTP_RELAY(in_dev) IN_DEV_ANDCONF((in_dev), BOOTP_RELAY) #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) #define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ SECURE_REDIRECTS) #define IN_DEV_IDTAG(in_dev) IN_DEV_CONF_GET(in_dev, TAG) #define IN_DEV_MEDIUM_ID(in_dev) IN_DEV_CONF_GET(in_dev, MEDIUM_ID) #define IN_DEV_PROMOTE_SECONDARIES(in_dev) \ IN_DEV_ORCONF((in_dev), \ PROMOTE_SECONDARIES) #define IN_DEV_ROUTE_LOCALNET(in_dev) IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET) #define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net) \ IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET) #define IN_DEV_RX_REDIRECTS(in_dev) \ ((IN_DEV_FORWARD(in_dev) && \ IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \ || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) #define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_IGNORE) #define IN_DEV_ARP_NOTIFY(in_dev) IN_DEV_MAXCONF((in_dev), ARP_NOTIFY) #define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \ ARP_EVICT_NOCARRIER) struct in_ifaddr { struct hlist_node hash; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; __be32 ifa_local; __be32 ifa_address; __be32 ifa_mask; __u32 ifa_rt_priority; __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. 
*/ __u32 ifa_valid_lft; __u32 ifa_preferred_lft; unsigned long ifa_cstamp; /* created timestamp */ unsigned long ifa_tstamp; /* updated timestamp */ }; struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); int unregister_inetaddr_notifier(struct notifier_block *nb); int register_inetaddr_validator_notifier(struct notifier_block *nb); int unregister_inetaddr_validator_notifier(struct notifier_block *nb); void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf); struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref); static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) { return __ip_dev_find(net, addr, true); } int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b); int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *); #ifdef CONFIG_INET int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size); #else static inline int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { return 0; } #endif void devinet_init(void); struct in_device *inetdev_by_index(struct net *, int); __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope); __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); } /* * Check if a mask is acceptable. */ static __inline__ bool bad_mask(__be32 mask, __be32 addr) { __u32 hmask; if (addr & (mask = ~mask)) return true; hmask = ntohl(mask); if (hmask & (hmask+1)) return true; return false; } #define in_dev_for_each_ifa_rtnl(ifa, in_dev) \ for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->ip_ptr); } static inline struct in_device *in_dev_get(const struct net_device *dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) refcount_inc(&in_dev->refcnt); rcu_read_unlock(); return in_dev; } static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->ip_ptr); } /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { struct in_device *in_dev; bool rc = false; in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rc = true; return rc; } static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); return in_dev ? 
in_dev->arp_parms : NULL; } void in_dev_finish_destroy(struct in_device *idev); static inline void in_dev_put(struct in_device *idev) { if (refcount_dec_and_test(&idev->refcnt)) in_dev_finish_destroy(idev); } #define __in_dev_put(idev) refcount_dec(&(idev)->refcnt) #define in_dev_hold(idev) refcount_inc(&(idev)->refcnt) #endif /* __KERNEL__ */ static __inline__ __be32 inet_make_mask(int logmask) { if (logmask) return htonl(~((1U<<(32-logmask))-1)); return 0; } static __inline__ int inet_mask_len(__be32 mask) { __u32 hmask = ntohl(mask); if (!hmask) return 0; return 32 - ffz(~hmask); } #endif /* _LINUX_INETDEVICE_H */ |
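/*
 * Illustration (not part of inetdevice.h): a minimal userspace sketch of
 * the netmask helpers defined above. The function names and the use of
 * libc's htonl()/ntohl() in place of the kernel byte-order helpers are
 * assumptions for the demo, and inet_mask_len()'s ffz()-based computation
 * is replaced by an equivalent shift loop.
 */
#include <arpa/inet.h>
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace copy of inet_make_mask(): a prefix length of 24
 * becomes the big-endian mask 255.255.255.0. */
static uint32_t make_mask(int logmask)
{
	if (logmask)
		return htonl(~((1U << (32 - logmask)) - 1));
	return 0;
}

/* Userspace copy of inet_mask_len(): recover the prefix length by
 * counting the trailing zero bits of the host-order mask. */
static int mask_len(uint32_t mask)
{
	uint32_t hmask = ntohl(mask);
	int len = 32;

	if (!hmask)
		return 0;
	while (!(hmask & 1)) {
		hmask >>= 1;
		len--;
	}
	return len;
}

/* Userspace copy of bad_mask(): a mask is rejected when the address
 * has bits outside the mask, or when the mask is not contiguous. */
static bool mask_is_bad(uint32_t mask, uint32_t addr)
{
	uint32_t hmask;

	if (addr & (mask = ~mask))
		return true;
	hmask = ntohl(mask);
	if (hmask & (hmask + 1))
		return true;
	return false;
}

int main(void)
{
	/* Every prefix length survives the make/len round trip. */
	for (int len = 0; len <= 32; len++)
		assert(mask_len(make_mask(len)) == len);

	/* 255.255.0.255 is non-contiguous and therefore bad. */
	assert(mask_is_bad(htonl(0xffff00ffU), 0));
	printf("mask helpers behave as expected\n");
	return 0;
}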
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S.
Miller (davem@caip.rutgers.edu), 1995 * * Hash Tree Directory indexing (c) 2001 Daniel Phillips * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "ext4.h" #include "xattr.h" static int ext4_dx_readdir(struct file *, struct dir_context *); /** * is_dx_dir() - check if a directory is using htree indexing * @inode: directory inode * * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which could potentially get converted to use htree * indexing). * * Return 1 if it is a dx dir, 0 if not */ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) return 1; return 0; } static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) { /* Check if . or .. , or skip if namelen is 0 */ if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && (de->name[1] == '.' || de->name[1] == '\0')) return true; /* Check if this is a csum entry */ if (de->file_type == EXT4_FT_DIR_CSUM) return true; return false; } /* * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... * * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); bool has_csum = ext4_has_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; else if (unlikely(next_offset > size - ext4_dir_rec_len(1, has_csum ? 
NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; else return 0; if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), rlen, size, fake); return 1; } static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; int i; struct ext4_dir_entry_2 *de; int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); err = fscrypt_prepare_readdir(inode); if (err) return err; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) return err; /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. */ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } if (ext4_has_inline_data(inode)) { int has_inline_data = 1; err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) return err; } if (IS_ENCRYPTED(inode)) { err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr); if (err < 0) return err; } while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto errout; } cond_resched(); offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); if (err == 0) { /* m_len should never be zero but let's avoid * an infinite loop if it somehow is */ if (map.m_len == 0) map.m_len = 1; ctx->pos += map.m_len * sb->s_blocksize; continue; } if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_SHIFT - inode->i_blkbits); if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; goto errout; } } if (!bh) { /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; ctx->pos += sb->s_blocksize - offset; continue; } /* Check the checksum */ if (!buffer_verified(bh) && !ext4_dirblock_csum_verify(inode, bh)) { EXT4_ERROR_FILE(file, 0, "directory fails checksum " "at offset %llu", (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); bh = NULL; continue; } set_buffer_verified(bh); /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ if (!inode_eq_iversion(inode, file->f_version)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this * loop, but we do have to test at * least that it is non-zero. A * failure will be detected in the * dirent test below. 
*/ if (ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) < ext4_dir_rec_len(1, inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; file->f_version = inode_query_iversion(inode); } while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); if (ext4_check_dir_entry(inode, file, de, bh, bh->b_data, bh->b_size, offset)) { /* * On error, skip to the next block */ ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; break; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { int save_len = fstr.len; struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, EXT4_DIRENT_HASH(de), EXT4_DIRENT_MINOR_HASH(de), &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) goto errout; if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } if ((ctx->pos < inode->i_size) && !dir_relax_shared(inode)) goto done; brelse(bh); bh = NULL; } done: err = 0; errout: fscrypt_fname_free_buffer(&fstr); brelse(bh); return err; } static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif } /* * These functions convert from the major/minor hash to an f_pos * value for dx directories * * Upper layer (for example NFS) should specify FMODE_32BITHASH or * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted * directly on both 32-bit and 64-bit nodes, under such case, neither * FMODE_32BITHASH nor FMODE_64BITHASH is specified. */ static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return major >> 1; else return ((__u64)(major >> 1) << 32) | (__u64)minor; } static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return (pos << 1) & 0xffffffff; else return ((pos >> 32) << 1) & 0xffffffff; } static inline __u32 pos2min_hash(struct file *filp, loff_t pos) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return 0; else return pos & 0xffffffff; } /* * Return 32- or 64-bit end-of-file for dx directories */ static inline loff_t ext4_get_htree_eof(struct file *filp) { if ((filp->f_mode & FMODE_32BITHASH) || (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) return EXT4_HTREE_EOF_32BIT; else return EXT4_HTREE_EOF_64BIT; } /* * ext4_dir_llseek() calls generic_file_llseek_size to handle htree * directories, where the "offset" is in terms of the filename hash * value instead of the byte offset. * * Because we may return a 64-bit hash that is well beyond offset limits, * we need to pass the max hash as the maximum allowable offset in * the htree directory case. * * For non-htree, ext4_llseek already chooses the proper max offset. 
*/ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); if (likely(dx_dir)) ret = generic_file_llseek_size(file, offset, whence, htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); file->f_version = inode_peek_iversion(inode) - 1; return ret; } /* * This structure holds the nodes of the red-black tree used to store * the directory entry in hash order. */ struct fname { __u32 hash; __u32 minor_hash; struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; __u8 file_type; char name[]; }; /* * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) { struct fname *fname, *next; rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } *root = RB_ROOT; } static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, loff_t pos) { struct dir_private_info *p; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->curr_hash = pos2maj_hash(filp, pos); p->curr_minor_hash = pos2min_hash(filp, pos); return p; } void ext4_htree_free_dir_info(struct dir_private_info *p) { free_rb_tree_fname(&p->root); kfree(p); } /* * Given a directory entry, enter it into the fname rb tree. * * When filename encryption is enabled, the dirent will hold the * encrypted filename, while the htree will hold decrypted filename. * The decrypted filename is passed in via ent_name. parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; int len; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ len = sizeof(struct fname) + ent_name->len + 1; new_fn = kzalloc(len, GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); while (*p) { parent = *p; fname = rb_entry(parent, struct fname, rb_hash); /* * If the hash and minor hash match up, then we put * them on a linked list. This rarely happens... */ if ((new_fn->hash == fname->hash) && (new_fn->minor_hash == fname->minor_hash)) { new_fn->next = fname->next; fname->next = new_fn; return 0; } if (new_fn->hash < fname->hash) p = &(*p)->rb_left; else if (new_fn->hash > fname->hash) p = &(*p)->rb_right; else if (new_fn->minor_hash < fname->minor_hash) p = &(*p)->rb_left; else /* if (new_fn->minor_hash > fname->minor_hash) */ p = &(*p)->rb_right; } rb_link_node(&new_fn->rb_hash, parent, p); rb_insert_color(&new_fn->rb_hash, &info->root); return 0; } /* * This is a helper function for ext4_dx_readdir. It calls filldir * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) 
*/ static int call_filldir(struct file *file, struct dir_context *ctx, struct fname *fname) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; if (!fname) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, get_dtype(sb, fname->file_type))) { info->extra_fname = fname; return 1; } fname = fname->next; } return 0; } static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; int ret = 0; if (!info) { info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; file->private_data = info; } if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Some one has messed with f_pos; reset the world */ if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; info->curr_hash = pos2maj_hash(file, ctx->pos); info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ if (info->extra_fname) { if (call_filldir(file, ctx, info->extra_fname)) goto finished; info->extra_fname = NULL; goto next_node; } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the * cached entries. */ if ((!info->curr_node) || !inode_eq_iversion(inode, file->f_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); file->f_version = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); } fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; if (call_filldir(file, ctx, fname)) break; next_node: info->curr_node = rb_next(info->curr_node); if (info->curr_node) { fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; } else { if (info->next_hash == ~0) { ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; info->curr_minor_hash = 0; } } finished: info->last_pos = ctx->pos; return ret < 0 ? 
ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size) { struct ext4_dir_entry_2 *de; int rlen; unsigned int offset = 0; char *top; de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) return -EFSCORRUPTED; rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) return -EFSCORRUPTED; return 0; } const struct file_operations ext4_dir_operations = { .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, .release = ext4_release_dir, }; |
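/*
 * Illustration (not part of ext4/dir.c): the hash <-> f_pos packing used
 * by ext4_dx_readdir() above, extracted into a standalone sketch. The
 * bool parameter stands in for the FMODE_32BITHASH / is_32bit_api()
 * checks in the driver code; names and test values are illustrative.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Pack a (major, minor) htree hash pair into an f_pos value. ext4's
 * htree code keeps bit 0 of the major hash clear, so "major >> 1"
 * loses nothing. */
static uint64_t hash2pos(bool use_32bit, uint32_t major, uint32_t minor)
{
	if (use_32bit)
		return major >> 1;
	return ((uint64_t)(major >> 1) << 32) | minor;
}

static uint32_t pos2maj_hash(bool use_32bit, uint64_t pos)
{
	if (use_32bit)
		return (pos << 1) & 0xffffffff;
	return ((pos >> 32) << 1) & 0xffffffff;
}

static uint32_t pos2min_hash(bool use_32bit, uint64_t pos)
{
	if (use_32bit)
		return 0;
	return pos & 0xffffffff;
}

int main(void)
{
	uint32_t major = 0xdeadbeee;	/* bit 0 clear, as in ext4's htree */
	uint32_t minor = 0x12345678;

	/* 64-bit mode: both halves survive the round trip. */
	uint64_t pos = hash2pos(false, major, minor);
	assert(pos2maj_hash(false, pos) == major);
	assert(pos2min_hash(false, pos) == minor);

	/* 32-bit mode: the minor hash is deliberately dropped. */
	pos = hash2pos(true, major, minor);
	assert(pos2maj_hash(true, pos) == major);
	assert(pos2min_hash(true, pos) == 0);

	printf("hash <-> pos round trip ok\n");
	return 0;
}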
// SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio memory mapped device driver * * Copyright 2011-2014, ARM Ltd.
* * This module allows virtio devices to be used over a virtual, memory mapped * platform device. * * The guest device(s) may be instantiated in one of three equivalent ways: * * 1. Static platform device in board's code, eg.: * * static struct platform_device v2m_virtio_device = { * .name = "virtio-mmio", * .id = -1, * .num_resources = 2, * .resource = (struct resource []) { * { * .start = 0x1001e000, * .end = 0x1001e0ff, * .flags = IORESOURCE_MEM, * }, { * .start = 42 + 32, * .end = 42 + 32, * .flags = IORESOURCE_IRQ, * }, * } * }; * * 2. Device Tree node, eg.: * * virtio_block@1e000 { * compatible = "virtio,mmio"; * reg = <0x1e000 0x100>; * interrupts = <42>; * } * * 3. Kernel module (or command line) parameter. Can be used more than once - * one device will be created for each one. Syntax: * * [virtio_mmio.]device=<size>@<baseaddr>:<irq>[:<id>] * where: * <size> := size (can use standard suffixes like K, M or G) * <baseaddr> := physical base address * <irq> := interrupt number (as passed to request_irq()) * <id> := (optional) platform device id * eg.: * virtio_mmio.device=0x100@0x100b0000:48 \ * virtio_mmio.device=1K@0x1001e000:74 * * Based on Virtio PCI driver by Anthony Liguori, copyright IBM Corp. 2007 */ #define pr_fmt(fmt) "virtio-mmio: " fmt #include <linux/acpi.h> #include <linux/dma-mapping.h> #include <linux/highmem.h> #include <linux/interrupt.h> #include <linux/io.h> #include <linux/list.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/pm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/virtio.h> #include <linux/virtio_config.h> #include <uapi/linux/virtio_mmio.h> #include <linux/virtio_ring.h> /* The alignment to use between consumer and producer parts of vring. * Currently hardcoded to the page size. */ #define VIRTIO_MMIO_VRING_ALIGN PAGE_SIZE #define to_virtio_mmio_device(_plat_dev) \ container_of(_plat_dev, struct virtio_mmio_device, vdev) struct virtio_mmio_device { struct virtio_device vdev; struct platform_device *pdev; void __iomem *base; unsigned long version; /* a list of queues so we can dispatch IRQs */ spinlock_t lock; struct list_head virtqueues; }; struct virtio_mmio_vq_info { /* the actual virtqueue */ struct virtqueue *vq; /* the list node for the virtqueues list */ struct list_head node; }; /* Configuration interface */ static u64 vm_get_features(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); u64 features; writel(1, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); features = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); features <<= 32; writel(0, vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES_SEL); features |= readl(vm_dev->base + VIRTIO_MMIO_DEVICE_FEATURES); return features; } static int vm_finalize_features(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); /* Give virtio_ring a chance to accept features. 
*/ vring_transport_features(vdev); /* Make sure there are no mixed devices */ if (vm_dev->version == 2 && !__virtio_test_bit(vdev, VIRTIO_F_VERSION_1)) { dev_err(&vdev->dev, "New virtio-mmio devices (version 2) must provide VIRTIO_F_VERSION_1 feature!\n"); return -EINVAL; } writel(1, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); writel((u32)(vdev->features >> 32), vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); writel(0, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES_SEL); writel((u32)vdev->features, vm_dev->base + VIRTIO_MMIO_DRIVER_FEATURES); return 0; } static void vm_get(struct virtio_device *vdev, unsigned int offset, void *buf, unsigned int len) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; u8 b; __le16 w; __le32 l; if (vm_dev->version == 1) { u8 *ptr = buf; int i; for (i = 0; i < len; i++) ptr[i] = readb(base + offset + i); return; } switch (len) { case 1: b = readb(base + offset); memcpy(buf, &b, sizeof b); break; case 2: w = cpu_to_le16(readw(base + offset)); memcpy(buf, &w, sizeof w); break; case 4: l = cpu_to_le32(readl(base + offset)); memcpy(buf, &l, sizeof l); break; case 8: l = cpu_to_le32(readl(base + offset)); memcpy(buf, &l, sizeof l); l = cpu_to_le32(ioread32(base + offset + sizeof l)); memcpy(buf + sizeof l, &l, sizeof l); break; default: BUG(); } } static void vm_set(struct virtio_device *vdev, unsigned int offset, const void *buf, unsigned int len) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); void __iomem *base = vm_dev->base + VIRTIO_MMIO_CONFIG; u8 b; __le16 w; __le32 l; if (vm_dev->version == 1) { const u8 *ptr = buf; int i; for (i = 0; i < len; i++) writeb(ptr[i], base + offset + i); return; } switch (len) { case 1: memcpy(&b, buf, sizeof b); writeb(b, base + offset); break; case 2: memcpy(&w, buf, sizeof w); writew(le16_to_cpu(w), base + offset); break; case 4: memcpy(&l, buf, sizeof l); writel(le32_to_cpu(l), base + offset); break; case 8: memcpy(&l, buf, sizeof l); writel(le32_to_cpu(l), base + offset); memcpy(&l, buf + sizeof l, sizeof l); writel(le32_to_cpu(l), base + offset + sizeof l); break; default: BUG(); } } static u32 vm_generation(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); if (vm_dev->version == 1) return 0; else return readl(vm_dev->base + VIRTIO_MMIO_CONFIG_GENERATION); } static u8 vm_get_status(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); return readl(vm_dev->base + VIRTIO_MMIO_STATUS) & 0xff; } static void vm_set_status(struct virtio_device *vdev, u8 status) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); /* We should never be setting status to 0. */ BUG_ON(status == 0); /* * Per memory-barriers.txt, wmb() is not needed to guarantee * that the cache coherent memory writes have completed * before writing to the MMIO region. */ writel(status, vm_dev->base + VIRTIO_MMIO_STATUS); } static void vm_reset(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); /* 0 status means a reset. 
*/ writel(0, vm_dev->base + VIRTIO_MMIO_STATUS); } /* Transport interface */ /* the notify function used when creating a virt queue */ static bool vm_notify(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); /* We write the queue's selector into the notification register to * signal the other end */ writel(vq->index, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); return true; } static bool vm_notify_with_data(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); u32 data = vring_notification_data(vq); writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); return true; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vm_interrupt(int irq, void *opaque) { struct virtio_mmio_device *vm_dev = opaque; struct virtio_mmio_vq_info *info; unsigned long status; unsigned long flags; irqreturn_t ret = IRQ_NONE; /* Read and acknowledge interrupts */ status = readl(vm_dev->base + VIRTIO_MMIO_INTERRUPT_STATUS); writel(status, vm_dev->base + VIRTIO_MMIO_INTERRUPT_ACK); if (unlikely(status & VIRTIO_MMIO_INT_CONFIG)) { virtio_config_changed(&vm_dev->vdev); ret = IRQ_HANDLED; } if (likely(status & VIRTIO_MMIO_INT_VRING)) { spin_lock_irqsave(&vm_dev->lock, flags); list_for_each_entry(info, &vm_dev->virtqueues, node) ret |= vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vm_dev->lock, flags); } return ret; } static void vm_del_vq(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); struct virtio_mmio_vq_info *info = vq->priv; unsigned long flags; unsigned int index = vq->index; spin_lock_irqsave(&vm_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vm_dev->lock, flags); /* Select and deactivate the queue */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); if (vm_dev->version == 1) { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } vring_del_virtqueue(vq); kfree(info); } static void vm_del_vqs(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtqueue *vq, *n; list_for_each_entry_safe(vq, n, &vdev->vqs, list) vm_del_vq(vq); free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); } static void vm_synchronize_cbs(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); synchronize_irq(platform_get_irq(vm_dev->pdev, 0)); } static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); bool (*notify)(struct virtqueue *vq); struct virtio_mmio_vq_info *info; struct virtqueue *vq; unsigned long flags; unsigned int num; int err; if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) notify = vm_notify_with_data; else notify = vm_notify; if (!name) return NULL; /* Select the queue we're interested in */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); /* Queue shouldn't already be set up. */ if (readl(vm_dev->base + (vm_dev->version == 1 ? 
VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) { err = -ENOENT; goto error_available; } /* Allocate and fill out our active queue description */ info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) { err = -ENOMEM; goto error_kmalloc; } num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); if (num == 0) { err = -ENOENT; goto error_new_virtqueue; } /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, true, true, ctx, notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; } vq->num_max = num; /* Activate the queue */ writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); if (vm_dev->version == 1) { u64 q_pfn = virtqueue_get_desc_addr(vq) >> PAGE_SHIFT; /* * virtio-mmio v1 uses a 32bit QUEUE PFN. If we have something * that doesn't fit in 32bit, fail the setup rather than * pretending to be successful. */ if (q_pfn >> 32) { dev_err(&vdev->dev, "platform bug: legacy virtio-mmio must not be used with RAM above 0x%llxGB\n", 0x1ULL << (32 + PAGE_SHIFT - 30)); err = -E2BIG; goto error_bad_pfn; } writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); writel(q_pfn, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { u64 addr; addr = virtqueue_get_desc_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH); addr = virtqueue_get_avail_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH); addr = virtqueue_get_used_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH); writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); } vq->priv = info; info->vq = vq; spin_lock_irqsave(&vm_dev->lock, flags); list_add(&info->node, &vm_dev->virtqueues); spin_unlock_irqrestore(&vm_dev->lock, flags); return vq; error_bad_pfn: vring_del_virtqueue(vq); error_new_virtqueue: if (vm_dev->version == 1) { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } kfree(info); error_kmalloc: error_available: return ERR_PTR(err); } static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], const bool *ctx, struct irq_affinity *desc) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); int irq = platform_get_irq(vm_dev->pdev, 0); int i, err, queue_idx = 0; if (irq < 0) return irq; err = request_irq(irq, vm_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vm_dev); if (err) return err; if (of_property_read_bool(vm_dev->pdev->dev.of_node, "wakeup-source")) enable_irq_wake(irq); for (i = 0; i < nvqs; ++i) { if (!names[i]) { vqs[i] = NULL; continue; } vqs[i] = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i], ctx ? 
ctx[i] : false); if (IS_ERR(vqs[i])) { vm_del_vqs(vdev); return PTR_ERR(vqs[i]); } } return 0; } static const char *vm_bus_name(struct virtio_device *vdev) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); return vm_dev->pdev->name; } static bool vm_get_shm_region(struct virtio_device *vdev, struct virtio_shm_region *region, u8 id) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); u64 len, addr; /* Select the region we're interested in */ writel(id, vm_dev->base + VIRTIO_MMIO_SHM_SEL); /* Read the region size */ len = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_LOW); len |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_LEN_HIGH) << 32; region->len = len; /* Check if region length is -1. If that's the case, the shared memory * region does not exist and there is no need to proceed further. */ if (len == ~(u64)0) return false; /* Read the region base address */ addr = (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_LOW); addr |= (u64) readl(vm_dev->base + VIRTIO_MMIO_SHM_BASE_HIGH) << 32; region->addr = addr; return true; } static const struct virtio_config_ops virtio_mmio_config_ops = { .get = vm_get, .set = vm_set, .generation = vm_generation, .get_status = vm_get_status, .set_status = vm_set_status, .reset = vm_reset, .find_vqs = vm_find_vqs, .del_vqs = vm_del_vqs, .get_features = vm_get_features, .finalize_features = vm_finalize_features, .bus_name = vm_bus_name, .get_shm_region = vm_get_shm_region, .synchronize_cbs = vm_synchronize_cbs, }; #ifdef CONFIG_PM_SLEEP static int virtio_mmio_freeze(struct device *dev) { struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev); return virtio_device_freeze(&vm_dev->vdev); } static int virtio_mmio_restore(struct device *dev) { struct virtio_mmio_device *vm_dev = dev_get_drvdata(dev); if (vm_dev->version == 1) writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); return virtio_device_restore(&vm_dev->vdev); } static const struct dev_pm_ops virtio_mmio_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(virtio_mmio_freeze, virtio_mmio_restore) }; #endif static void virtio_mmio_release_dev(struct device *_d) { struct virtio_device *vdev = container_of(_d, struct virtio_device, dev); struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); kfree(vm_dev); } /* Platform device */ static int virtio_mmio_probe(struct platform_device *pdev) { struct virtio_mmio_device *vm_dev; unsigned long magic; int rc; vm_dev = kzalloc(sizeof(*vm_dev), GFP_KERNEL); if (!vm_dev) return -ENOMEM; vm_dev->vdev.dev.parent = &pdev->dev; vm_dev->vdev.dev.release = virtio_mmio_release_dev; vm_dev->vdev.config = &virtio_mmio_config_ops; vm_dev->pdev = pdev; INIT_LIST_HEAD(&vm_dev->virtqueues); spin_lock_init(&vm_dev->lock); vm_dev->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(vm_dev->base)) { rc = PTR_ERR(vm_dev->base); goto free_vm_dev; } /* Check magic value */ magic = readl(vm_dev->base + VIRTIO_MMIO_MAGIC_VALUE); if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) { dev_warn(&pdev->dev, "Wrong magic value 0x%08lx!\n", magic); rc = -ENODEV; goto free_vm_dev; } /* Check device version */ vm_dev->version = readl(vm_dev->base + VIRTIO_MMIO_VERSION); if (vm_dev->version < 1 || vm_dev->version > 2) { dev_err(&pdev->dev, "Version %ld not supported!\n", vm_dev->version); rc = -ENXIO; goto free_vm_dev; } vm_dev->vdev.id.device = readl(vm_dev->base + VIRTIO_MMIO_DEVICE_ID); if (vm_dev->vdev.id.device == 0) { /* * virtio-mmio device with an ID 0 is a (dummy) placeholder * with no function. End probing now with no error reported. 
*/ rc = -ENODEV; goto free_vm_dev; } vm_dev->vdev.id.vendor = readl(vm_dev->base + VIRTIO_MMIO_VENDOR_ID); if (vm_dev->version == 1) { writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_GUEST_PAGE_SIZE); rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); /* * In the legacy case, ensure our coherently-allocated virtio * ring will be at an address expressable as a 32-bit PFN. */ if (!rc) dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32 + PAGE_SHIFT)); } else { rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); } if (rc) rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) dev_warn(&pdev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); platform_set_drvdata(pdev, vm_dev); rc = register_virtio_device(&vm_dev->vdev); if (rc) put_device(&vm_dev->vdev.dev); return rc; free_vm_dev: kfree(vm_dev); return rc; } static int virtio_mmio_remove(struct platform_device *pdev) { struct virtio_mmio_device *vm_dev = platform_get_drvdata(pdev); unregister_virtio_device(&vm_dev->vdev); return 0; } /* Devices list parameter */ #if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES) static struct device vm_cmdline_parent = { .init_name = "virtio-mmio-cmdline", }; static int vm_cmdline_parent_registered; static int vm_cmdline_id; static int vm_cmdline_set(const char *device, const struct kernel_param *kp) { int err; struct resource resources[2] = {}; char *str; long long base, size; unsigned int irq; int processed, consumed = 0; struct platform_device *pdev; /* Consume "size" part of the command line parameter */ size = memparse(device, &str); /* Get "@<base>:<irq>[:<id>]" chunks */ processed = sscanf(str, "@%lli:%u%n:%d%n", &base, &irq, &consumed, &vm_cmdline_id, &consumed); /* * sscanf() must process at least 2 chunks; also there * must be no extra characters after the last chunk, so * str[consumed] must be '\0' */ if (processed < 2 || str[consumed] || irq == 0) return -EINVAL; resources[0].flags = IORESOURCE_MEM; resources[0].start = base; resources[0].end = base + size - 1; resources[1].flags = IORESOURCE_IRQ; resources[1].start = resources[1].end = irq; if (!vm_cmdline_parent_registered) { err = device_register(&vm_cmdline_parent); if (err) { put_device(&vm_cmdline_parent); pr_err("Failed to register parent device!\n"); return err; } vm_cmdline_parent_registered = 1; } pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n", vm_cmdline_id, (unsigned long long)resources[0].start, (unsigned long long)resources[0].end, (int)resources[1].start); pdev = platform_device_register_resndata(&vm_cmdline_parent, "virtio-mmio", vm_cmdline_id++, resources, ARRAY_SIZE(resources), NULL, 0); return PTR_ERR_OR_ZERO(pdev); } static int vm_cmdline_get_device(struct device *dev, void *data) { char *buffer = data; unsigned int len = strlen(buffer); struct platform_device *pdev = to_platform_device(dev); snprintf(buffer + len, PAGE_SIZE - len, "0x%llx@0x%llx:%llu:%d\n", pdev->resource[0].end - pdev->resource[0].start + 1ULL, (unsigned long long)pdev->resource[0].start, (unsigned long long)pdev->resource[1].start, pdev->id); return 0; } static int vm_cmdline_get(char *buffer, const struct kernel_param *kp) { buffer[0] = '\0'; device_for_each_child(&vm_cmdline_parent, buffer, vm_cmdline_get_device); return strlen(buffer) + 1; } static const struct kernel_param_ops vm_cmdline_param_ops = { .set = vm_cmdline_set, .get = vm_cmdline_get, }; device_param_cb(device, &vm_cmdline_param_ops, NULL, S_IRUSR); static int vm_unregister_cmdline_device(struct device *dev, 
void *data) { platform_device_unregister(to_platform_device(dev)); return 0; } static void vm_unregister_cmdline_devices(void) { if (vm_cmdline_parent_registered) { device_for_each_child(&vm_cmdline_parent, NULL, vm_unregister_cmdline_device); device_unregister(&vm_cmdline_parent); vm_cmdline_parent_registered = 0; } } #else static void vm_unregister_cmdline_devices(void) { } #endif /* Platform driver */ static const struct of_device_id virtio_mmio_match[] = { { .compatible = "virtio,mmio", }, {}, }; MODULE_DEVICE_TABLE(of, virtio_mmio_match); #ifdef CONFIG_ACPI static const struct acpi_device_id virtio_mmio_acpi_match[] = { { "LNRO0005", }, { } }; MODULE_DEVICE_TABLE(acpi, virtio_mmio_acpi_match); #endif static struct platform_driver virtio_mmio_driver = { .probe = virtio_mmio_probe, .remove = virtio_mmio_remove, .driver = { .name = "virtio-mmio", .of_match_table = virtio_mmio_match, .acpi_match_table = ACPI_PTR(virtio_mmio_acpi_match), #ifdef CONFIG_PM_SLEEP .pm = &virtio_mmio_pm_ops, #endif }, }; static int __init virtio_mmio_init(void) { return platform_driver_register(&virtio_mmio_driver); } static void __exit virtio_mmio_exit(void) { platform_driver_unregister(&virtio_mmio_driver); vm_unregister_cmdline_devices(); } module_init(virtio_mmio_init); module_exit(virtio_mmio_exit); MODULE_AUTHOR("Pawel Moll <pawel.moll@arm.com>"); MODULE_DESCRIPTION("Platform bus driver for memory mapped virtio devices"); MODULE_LICENSE("GPL"); |
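/*
 * Illustration (not part of virtio_mmio.c): a standalone sketch of the
 * "<size>@<baseaddr>:<irq>[:<id>]" parsing done by vm_cmdline_set()
 * above. memparse_ish() is a simplified stand-in for the kernel's
 * memparse() (it only handles the K/M/G suffixes); the sscanf() format
 * string mirrors the one the driver uses.
 */
#include <stdio.h>
#include <stdlib.h>

/* Parse a number with an optional K/M/G suffix and return a pointer
 * past what was consumed, roughly like the kernel's memparse(). */
static unsigned long long memparse_ish(const char *s, char **endp)
{
	unsigned long long v = strtoull(s, endp, 0);

	switch (**endp) {
	case 'G': v <<= 10; /* fall through */
	case 'M': v <<= 10; /* fall through */
	case 'K': v <<= 10; (*endp)++; break;
	}
	return v;
}

int main(void)
{
	const char *device = "1K@0x1001e000:74:3";
	char *str;
	long long base;
	unsigned int irq;
	int id = -1, processed, consumed = 0;

	unsigned long long size = memparse_ish(device, &str);

	/* Same chunks as the driver: "@<base>:<irq>[:<id>]". The %n
	 * conversions record how much was consumed, so trailing junk
	 * after the last chunk is detected via str[consumed]. */
	processed = sscanf(str, "@%lli:%u%n:%d%n",
			   &base, &irq, &consumed, &id, &consumed);
	if (processed < 2 || str[consumed] || irq == 0) {
		fprintf(stderr, "invalid device description\n");
		return 1;
	}
	printf("size=%llu base=0x%llx irq=%u id=%d\n",
	       size, (unsigned long long)base, irq, id);
	return 0;
}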
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" /* * Convert from filesystem to in-memory representation. */ static struct posix_acl * ext4_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(ext4_acl_header)) return ERR_PTR(-EINVAL); if (((ext4_acl_header *)value)->a_version != cpu_to_le32(EXT4_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(ext4_acl_header); count = ext4_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); break; case ACL_USER: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: goto fail; } } if (value != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } /* * Convert from in-memory to filesystem representation.
*/ static void * ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) { ext4_acl_header *ext_acl; char *e; size_t n; *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl_e->e_tag); entry->e_perm = cpu_to_le16(acl_e->e_perm); switch (acl_e->e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); e += sizeof(ext4_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: e += sizeof(ext4_acl_entry_short); break; default: goto fail; } } return (char *)ext_acl; fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } /* * Inode operation get_posix_acl(). * * inode->i_rwsem: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; if (rcu) return ERR_PTR(-ECHILD); switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); } if (retval > 0) acl = ext4_acl_from_disk(value, retval); else if (retval == -ENODATA || retval == -ENOSYS) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } /* * Set the access or default ACL of an inode. * * inode->i_rwsem: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; size_t size = 0; int error; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } error = ext4_xattr_set_handle(handle, inode, name_index, "", value, size, xattr_flags); kfree(value); if (!error) set_cached_acl(inode, type, acl); return error; } int ext4_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? 
ext4_acl_size(acl->a_count) : 0; struct inode *inode = d_inode(dentry); umode_t mode = inode->i_mode; int update_mode = 0; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(idmap, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; } /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * * dir->i_rwsem: down * inode->i_rwsem: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); } else { inode->i_acl = NULL; } return error; } |
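/*
 * Illustration (not part of ext4/acl.c): how the on-disk ACL size falls
 * out of the entry walk in ext4_acl_from_disk()/ext4_acl_to_disk() above.
 * The struct layouts and tag values are meant to mirror fs/ext4/acl.h and
 * the POSIX ACL tags, but the names here are local to this sketch.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified little-endian on-disk layout. */
struct acl_entry_short { uint16_t e_tag; uint16_t e_perm; };
struct acl_entry       { uint16_t e_tag; uint16_t e_perm; uint32_t e_id; };

enum { TAG_USER_OBJ = 1, TAG_USER = 2, TAG_GROUP_OBJ = 4,
       TAG_GROUP = 8, TAG_MASK = 16, TAG_OTHER = 32 };

/* Only ACL_USER and ACL_GROUP entries carry a qualifier id, so only
 * they use the long form - exactly the split the conversion routines
 * above walk through. */
static size_t entry_size(uint16_t tag)
{
	return (tag == TAG_USER || tag == TAG_GROUP) ?
		sizeof(struct acl_entry) : sizeof(struct acl_entry_short);
}

int main(void)
{
	/* "user::rw-, user:1000:rw-, group::r--, mask::rw-, other::r--" */
	uint16_t tags[] = { TAG_USER_OBJ, TAG_USER, TAG_GROUP_OBJ,
			    TAG_MASK, TAG_OTHER };
	size_t size = 4;	/* 4-byte header holding the version word */

	for (size_t i = 0; i < sizeof(tags) / sizeof(tags[0]); i++)
		size += entry_size(tags[i]);

	/* 4 (header) + 4 * 4 (short) + 1 * 8 (long) = 28 bytes on disk */
	assert(size == 28);
	printf("on-disk ACL size: %zu bytes\n", size);
	return 0;
}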
// SPDX-License-Identifier: GPL-2.0-or-later /* * SHA-256, as specified in * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf * * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc. */ #include <asm/unaligned.h> #include <crypto/sha256_base.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> static const u32 SHA256_K[] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; static inline u32 Ch(u32 x, u32 y, u32 z) { return z ^ (x & (y ^ z)); } static inline u32 Maj(u32 x, u32 y, u32 z) { return (x & y) | (z & (x | y)); } #define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) #define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) #define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) #define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { W[I] = get_unaligned_be32((__u32 *)input + I); } static inline void BLEND_OP(int I, u32 *W) { W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; } #define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do { \ u32 t1, t2; \ t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i]; \ t2 = e0(a) + Maj(a, b, c); \ d += t1; \ h = t1 + t2; \ } while (0) static void sha256_transform(u32 *state, const u8 *input, u32 *W) { u32 a, b, c, d, e, f, g, h; int i; /* load the input */ for (i = 0; i < 16; i += 8) { LOAD_OP(i + 0, W, input); LOAD_OP(i + 1, W, input); LOAD_OP(i + 2, W, input); LOAD_OP(i + 3, W, input); LOAD_OP(i + 4, W, input); LOAD_OP(i + 5, W, input); LOAD_OP(i + 6, W, input); LOAD_OP(i + 7, W, input); } /* now blend */ for (i = 16; i < 64; i += 8) { BLEND_OP(i + 0, W); BLEND_OP(i + 1, W); BLEND_OP(i + 2, W); BLEND_OP(i + 3, W); BLEND_OP(i + 4, W); BLEND_OP(i + 5, W); BLEND_OP(i + 6, W); BLEND_OP(i + 7, W); } /* load the state into our registers */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; f = state[5]; g = state[6]; h = state[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { SHA256_ROUND(i + 0,
a, b, c, d, e, f, g, h); SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; } static void sha256_transform_blocks(struct sha256_state *sctx, const u8 *input, int blocks) { u32 W[64]; do { sha256_transform(sctx->state, input, W); input += SHA256_BLOCK_SIZE; } while (--blocks); memzero_explicit(W, sizeof(W)); } void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) { lib_sha256_base_do_update(sctx, data, len, sha256_transform_blocks); } EXPORT_SYMBOL(sha256_update); static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_size) { lib_sha256_base_do_finalize(sctx, sha256_transform_blocks); lib_sha256_base_finish(sctx, out, digest_size); } void sha256_final(struct sha256_state *sctx, u8 *out) { __sha256_final(sctx, out, 32); } EXPORT_SYMBOL(sha256_final); void sha224_final(struct sha256_state *sctx, u8 *out) { __sha256_final(sctx, out, 28); } EXPORT_SYMBOL(sha224_final); void sha256(const u8 *data, unsigned int len, u8 *out) { struct sha256_state sctx; sha256_init(&sctx); sha256_update(&sctx, data, len); sha256_final(&sctx, out); } EXPORT_SYMBOL(sha256); MODULE_LICENSE("GPL"); |
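The exported API above composes in the usual init/update/final pattern. A minimal sketch of a caller, assuming the declarations from <crypto/sha2.h> (the header name varies across kernel versions); the function name below is hypothetical:

/* Hypothetical caller: hash one buffer via the one-shot helper, then
 * again via the streaming interface, and check that the digests match.
 */
static void sha256_usage_sketch(const u8 *msg, unsigned int len)
{
	struct sha256_state sctx;
	u8 d1[SHA256_DIGEST_SIZE], d2[SHA256_DIGEST_SIZE];

	sha256(msg, len, d1);			/* one-shot */

	sha256_init(&sctx);			/* streaming, two halves */
	sha256_update(&sctx, msg, len / 2);
	sha256_update(&sctx, msg + len / 2, len - len / 2);
	sha256_final(&sctx, d2);

	WARN_ON(memcmp(d1, d2, SHA256_DIGEST_SIZE) != 0);
}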
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * These are the definitions needed for the sctp_ulpevent type.  The
 * sctp_ulpevent type is used to carry information from the state machine
 * upwards to the ULP.
 *
 * This file is part of the SCTP kernel implementation.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson          <karl@athena.chicago.il.us>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 */

#ifndef __sctp_ulpevent_h__
#define __sctp_ulpevent_h__

/* A structure to carry information to the ULP (e.g. Sockets API). */
/* Warning: This sits inside an skb.cb[] area.  Be very careful about
 * growing this structure, as it is at the maximum limit now.
 *
 * sctp_ulpevent is saved in skb->cb (48 bytes), whose last 4 bytes
 * are taken by sock_skb_cb, so 'packed' is needed to make
 * sctp_ulpevent fit into the remaining 44 bytes.
 */
struct sctp_ulpevent {
	struct sctp_association *asoc;
	struct sctp_chunk *chunk;
	unsigned int rmem_len;
	union {
		__u32 mid;
		__u16 ssn;
	};
	union {
		__u32 ppid;
		__u32 fsn;
	};
	__u32 tsn;
	__u32 cumtsn;
	__u16 stream;
	__u16 flags;
	__u16 msg_flags;
} __packed;

/* Retrieve the skb this event sits inside of. */
static inline struct sk_buff *sctp_event2skb(const struct sctp_ulpevent *ev)
{
	return container_of((void *)ev, struct sk_buff, cb);
}

/* Retrieve & cast the event sitting inside the skb. */
static inline struct sctp_ulpevent *sctp_skb2event(struct sk_buff *skb)
{
	return (struct sctp_ulpevent *)skb->cb;
}

void sctp_ulpevent_free(struct sctp_ulpevent *);
int sctp_ulpevent_is_notification(const struct sctp_ulpevent *);
unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list);

struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
	const struct sctp_association *asoc,
	__u16 flags, __u16 state, __u16 error,
	__u16 outbound, __u16 inbound,
	struct sctp_chunk *chunk, gfp_t gfp);

void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport,
					   int state, int error);

struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
	const struct sctp_association *asoc,
	struct sctp_chunk *chunk, __u16 flags, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
	const struct sctp_association *asoc,
	struct sctp_chunk *chunk, __u16 flags,
	__u32 error, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event(
	const struct sctp_association *asoc,
	struct sctp_chunk *chunk, __u16 flags,
	__u32 error, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
	const struct sctp_association *asoc,
	__u16 flags, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
	const struct sctp_association *asoc,
	__u32 indication, __u32 sid, __u32 seq,
	__u32 flags, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication(
	const struct sctp_association *asoc, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
						struct sctp_chunk *chunk,
						gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_authkey(
	const struct sctp_association *asoc, __u16 key_id,
	__u32 indication, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event(
	const struct sctp_association *asoc, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event(
	const struct sctp_association *asoc, __u16 flags,
	__u16 stream_num, __be16 *stream_list, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event(
	const struct sctp_association *asoc, __u16 flags,
	__u32 local_tsn, __u32 remote_tsn, gfp_t gfp);

struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event(
	const struct sctp_association *asoc, __u16 flags,
	__u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp);

struct sctp_ulpevent *sctp_make_reassembled_event(
	struct net *net, struct sk_buff_head *queue,
	struct sk_buff *f_frag, struct sk_buff *l_frag);

void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
				   struct msghdr *);
void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event,
				struct msghdr *);
void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event,
				struct msghdr *, struct sock *sk);

__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event);

static inline void sctp_ulpevent_type_set(__u16 *subscribe,
					  __u16 sn_type, __u8 on)
{
	if (sn_type > SCTP_SN_TYPE_MAX)
		return;

	if (on)
		*subscribe |=  (1 << (sn_type - SCTP_SN_TYPE_BASE));
	else
		*subscribe &= ~(1 << (sn_type - SCTP_SN_TYPE_BASE));
}

/* Is this event type enabled? */
static inline bool sctp_ulpevent_type_enabled(__u16 subscribe, __u16 sn_type)
{
	if (sn_type > SCTP_SN_TYPE_MAX)
		return false;

	return subscribe & (1 << (sn_type - SCTP_SN_TYPE_BASE));
}

/* Given an event subscription, is this event enabled? */
static inline bool sctp_ulpevent_is_enabled(const struct sctp_ulpevent *event,
					    __u16 subscribe)
{
	__u16 sn_type;

	if (!sctp_ulpevent_is_notification(event))
		return true;

	sn_type = sctp_ulpevent_get_notification_type(event);

	return sctp_ulpevent_type_enabled(subscribe, sn_type);
}

#endif /* __sctp_ulpevent_h__ */
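The subscription mask manipulated by the helpers above is a plain bitmask indexed from SCTP_SN_TYPE_BASE. A brief sketch of how the three inline helpers combine, assuming the SCTP_ASSOC_CHANGE notification type from the SCTP uapi header; the function name is hypothetical:

/* Hypothetical filter: deliver data messages always, but among
 * notifications deliver only SCTP_ASSOC_CHANGE.
 */
static bool want_event_sketch(const struct sctp_ulpevent *ev)
{
	__u16 subscribe = 0;

	sctp_ulpevent_type_set(&subscribe, SCTP_ASSOC_CHANGE, 1);

	return sctp_ulpevent_is_enabled(ev, subscribe);
}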
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ALSA sequencer FIFO
 * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl>
 */

#include <sound/core.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include "seq_fifo.h"
#include "seq_lock.h"

/* FIFO */

/* create new fifo */
struct snd_seq_fifo *snd_seq_fifo_new(int poolsize)
{
	struct snd_seq_fifo *f;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f)
		return NULL;

	f->pool = snd_seq_pool_new(poolsize);
	if (f->pool == NULL) {
		kfree(f);
		return NULL;
	}
	if (snd_seq_pool_init(f->pool) < 0) {
		snd_seq_pool_delete(&f->pool);
		kfree(f);
		return NULL;
	}

	spin_lock_init(&f->lock);
	snd_use_lock_init(&f->use_lock);
	init_waitqueue_head(&f->input_sleep);
	atomic_set(&f->overflow, 0);

	f->head = NULL;
	f->tail = NULL;
	f->cells = 0;

	return f;
}

void snd_seq_fifo_delete(struct snd_seq_fifo **fifo)
{
	struct snd_seq_fifo *f;

	if (snd_BUG_ON(!fifo))
		return;
	f = *fifo;
	if (snd_BUG_ON(!f))
		return;
	*fifo = NULL;

	if (f->pool)
		snd_seq_pool_mark_closing(f->pool);

	snd_seq_fifo_clear(f);

	/* wake up clients if any */
	if (waitqueue_active(&f->input_sleep))
		wake_up(&f->input_sleep);

	/* release resources */
	if (f->pool) {
		snd_seq_pool_done(f->pool);
		snd_seq_pool_delete(&f->pool);
	}

	kfree(f);
}

static struct snd_seq_event_cell *fifo_cell_out(struct snd_seq_fifo *f);

/* clear queue */
void snd_seq_fifo_clear(struct snd_seq_fifo *f)
{
	struct snd_seq_event_cell *cell;

	/* clear overflow flag */
	atomic_set(&f->overflow, 0);

	snd_use_lock_sync(&f->use_lock);
	spin_lock_irq(&f->lock);
	/* drain the fifo */
	while ((cell = fifo_cell_out(f)) != NULL) {
		snd_seq_cell_free(cell);
	}
	spin_unlock_irq(&f->lock);
}

/* enqueue event to fifo */
int snd_seq_fifo_event_in(struct snd_seq_fifo *f,
			  struct snd_seq_event *event)
{
	struct snd_seq_event_cell *cell;
	unsigned long flags;
	int err;

	if (snd_BUG_ON(!f))
		return -EINVAL;

	snd_use_lock_use(&f->use_lock);
	err = snd_seq_event_dup(f->pool, event, &cell, 1, NULL, NULL); /* always non-blocking */
	if (err < 0) {
		if ((err == -ENOMEM) || (err == -EAGAIN))
			atomic_inc(&f->overflow);
		snd_use_lock_free(&f->use_lock);
		return err;
	}

	/* append new cells to fifo */
	spin_lock_irqsave(&f->lock, flags);
	if (f->tail != NULL)
		f->tail->next = cell;
	f->tail = cell;
	if (f->head == NULL)
		f->head = cell;
	cell->next = NULL;
	f->cells++;
	spin_unlock_irqrestore(&f->lock, flags);

	/* wakeup client */
	if (waitqueue_active(&f->input_sleep))
		wake_up(&f->input_sleep);

	snd_use_lock_free(&f->use_lock);

	return 0; /* success */
}

/* dequeue cell from fifo */
static struct snd_seq_event_cell *fifo_cell_out(struct snd_seq_fifo *f)
{
	struct snd_seq_event_cell *cell;

	cell = f->head;
	if (cell) {
		f->head = cell->next;

		/* reset tail if this was the last element */
		if (f->tail == cell)
			f->tail = NULL;

		cell->next = NULL;
		f->cells--;
	}

	return cell;
}

/* dequeue a cell from the fifo; the caller then copies it to user space */
int snd_seq_fifo_cell_out(struct snd_seq_fifo *f,
			  struct snd_seq_event_cell **cellp, int nonblock)
{
	struct snd_seq_event_cell *cell;
	unsigned long flags;
	wait_queue_entry_t wait;

	if (snd_BUG_ON(!f))
		return -EINVAL;

	*cellp = NULL;
	init_waitqueue_entry(&wait, current);
	spin_lock_irqsave(&f->lock, flags);
	while ((cell = fifo_cell_out(f)) == NULL) {
		if (nonblock) {
			/* non-blocking - return immediately */
			spin_unlock_irqrestore(&f->lock, flags);
			return -EAGAIN;
		}
		set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&f->input_sleep, &wait);
		spin_unlock_irqrestore(&f->lock, flags);
		schedule();
		spin_lock_irqsave(&f->lock, flags);
		remove_wait_queue(&f->input_sleep, &wait);
		if (signal_pending(current)) {
			spin_unlock_irqrestore(&f->lock, flags);
			return -ERESTARTSYS;
		}
	}
	spin_unlock_irqrestore(&f->lock, flags);
	*cellp = cell;

	return 0;
}

void snd_seq_fifo_cell_putback(struct snd_seq_fifo *f,
			       struct snd_seq_event_cell *cell)
{
	unsigned long flags;

	if (cell) {
		spin_lock_irqsave(&f->lock, flags);
		cell->next = f->head;
		f->head = cell;
		if (!f->tail)
			f->tail = cell;
		f->cells++;
		spin_unlock_irqrestore(&f->lock, flags);
	}
}

/* polling; return non-zero if queue is available */
int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file,
			   poll_table *wait)
{
	poll_wait(file, &f->input_sleep, wait);
	return (f->cells > 0);
}

/* change the size of the pool; all old events are removed */
int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize)
{
	struct snd_seq_pool *newpool, *oldpool;
	struct snd_seq_event_cell *cell, *next, *oldhead;

	if (snd_BUG_ON(!f || !f->pool))
		return -EINVAL;

	/* allocate new pool */
	newpool = snd_seq_pool_new(poolsize);
	if (newpool == NULL)
		return -ENOMEM;
	if (snd_seq_pool_init(newpool) < 0) {
		snd_seq_pool_delete(&newpool);
		return -ENOMEM;
	}

	spin_lock_irq(&f->lock);
	/* remember old pool */
	oldpool = f->pool;
	oldhead = f->head;
	/* exchange pools */
	f->pool = newpool;
	f->head = NULL;
	f->tail = NULL;
	f->cells = 0;
	/* NOTE: overflow flag is not cleared */
	spin_unlock_irq(&f->lock);

	/* close the old pool and wait until all users are gone */
	snd_seq_pool_mark_closing(oldpool);
	snd_use_lock_sync(&f->use_lock);

	/* release cells in old pool */
	for (cell = oldhead; cell; cell = next) {
		next = cell->next;
		snd_seq_cell_free(cell);
	}
	snd_seq_pool_delete(&oldpool);

	return 0;
}

/* get the number of unused cells safely */
int snd_seq_fifo_unused_cells(struct snd_seq_fifo *f)
{
	unsigned long flags;
	int cells;

	if (!f)
		return 0;

	snd_use_lock_use(&f->use_lock);
	spin_lock_irqsave(&f->lock, flags);
	cells = snd_seq_unused_cells(f->pool);
	spin_unlock_irqrestore(&f->lock, flags);
	snd_use_lock_free(&f->use_lock);
	return cells;
}
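The functions above pair up into a simple producer/consumer lifecycle. A sketch of that flow, assuming a caller inside the sequencer core (snd_seq_cell_free() comes from the memory-pool code); the function name and pool size are hypothetical:

/* Hypothetical round trip: enqueue one event, dequeue it, tear down. */
static int fifo_roundtrip_sketch(struct snd_seq_event *ev)
{
	struct snd_seq_fifo *f;
	struct snd_seq_event_cell *cell;
	int err;

	f = snd_seq_fifo_new(64);		/* pool of 64 cells */
	if (!f)
		return -ENOMEM;

	err = snd_seq_fifo_event_in(f, ev);	/* copies *ev into a cell */
	if (!err) {
		err = snd_seq_fifo_cell_out(f, &cell, 1 /* nonblock */);
		if (!err)
			snd_seq_cell_free(cell); /* consumer owns the cell */
	}
	snd_seq_fifo_delete(&f);
	return err;
}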
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_COMPAT_H
#define _LINUX_COMPAT_H
/*
 * These are the type definitions for the architecture specific
 * syscall compatibility layer.
 */

#include <linux/types.h>
#include <linux/time.h>

#include <linux/stat.h>
#include <linux/param.h>	/* for HZ */
#include <linux/sem.h>
#include <linux/socket.h>
#include <linux/if.h>
#include <linux/fs.h>
#include <linux/aio_abi.h>	/* for aio_context_t */
#include <linux/uaccess.h>
#include <linux/unistd.h>

#include <asm/compat.h>
#include <asm/siginfo.h>
#include <asm/signal.h>

#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
/*
 * It may be useful for an architecture to override the definitions of the
 * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular
 * to use a different calling convention for syscalls. To allow for that,
 * the prototypes for the compat_sys_*() functions below will *not* be included
 * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
 */
#include <asm/syscall_wrapper.h>
#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */

#ifndef COMPAT_USE_64BIT_TIME
#define COMPAT_USE_64BIT_TIME 0
#endif

#ifndef __SC_DELOUSE
#define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v))
#endif

#ifndef COMPAT_SYSCALL_DEFINE0
#define COMPAT_SYSCALL_DEFINE0(name) \
	asmlinkage long compat_sys_##name(void); \
	ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \
	asmlinkage long compat_sys_##name(void)
#endif /* COMPAT_SYSCALL_DEFINE0 */

#define COMPAT_SYSCALL_DEFINE1(name, ...) \
	COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE2(name, ...) \
	COMPAT_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE3(name, ...) \
	COMPAT_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE4(name, ...) \
	COMPAT_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE5(name, ...) \
	COMPAT_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define COMPAT_SYSCALL_DEFINE6(name, ...) \
	COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

/*
 * The asmlinkage stub is aliased to a function named __se_compat_sys_*()
 * which sign-extends 32-bit ints to longs whenever needed. The actual work
 * is done within __do_compat_sys_*().
 */
#ifndef COMPAT_SYSCALL_DEFINEx
#define COMPAT_SYSCALL_DEFINEx(x, name, ...)					\
	__diag_push();								\
	__diag_ignore(GCC, 8, "-Wattribute-alias",				\
		      "Type aliasing is used to sanitize syscall arguments");	\
	asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(__se_compat_sys##name))));	\
	ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO);				\
	static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
	asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{									\
		long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\
		__MAP(x,__SC_TEST,__VA_ARGS__);					\
		return ret;							\
	}									\
	__diag_pop();								\
	static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#endif /* COMPAT_SYSCALL_DEFINEx */

struct compat_iovec {
	compat_uptr_t	iov_base;
	compat_size_t	iov_len;
};

#ifndef compat_user_stack_pointer
#define compat_user_stack_pointer() current_user_stack_pointer()
#endif
#ifndef compat_sigaltstack	/* we'll need that for MIPS */
typedef struct compat_sigaltstack {
	compat_uptr_t	ss_sp;
	int		ss_flags;
	compat_size_t	ss_size;
} compat_stack_t;
#endif
#ifndef COMPAT_MINSIGSTKSZ
#define COMPAT_MINSIGSTKSZ	MINSIGSTKSZ
#endif

#define compat_jiffies_to_clock_t(x)	\
		(((unsigned long)(x) * COMPAT_USER_HZ) / HZ)

typedef __compat_uid32_t	compat_uid_t;
typedef __compat_gid32_t	compat_gid_t;

struct compat_sel_arg_struct;
struct rusage;

struct old_itimerval32;

struct compat_tms {
	compat_clock_t	tms_utime;
	compat_clock_t	tms_stime;
	compat_clock_t	tms_cutime;
	compat_clock_t	tms_cstime;
};

#define _COMPAT_NSIG_WORDS	(_COMPAT_NSIG / _COMPAT_NSIG_BPW)

typedef struct {
	compat_sigset_word	sig[_COMPAT_NSIG_WORDS];
} compat_sigset_t;

int set_compat_user_sigmask(const compat_sigset_t __user *umask,
			    size_t sigsetsize);

struct compat_sigaction {
#ifndef __ARCH_HAS_IRIX_SIGACTION
	compat_uptr_t	sa_handler;
	compat_ulong_t	sa_flags;
#else
	compat_uint_t	sa_flags;
	compat_uptr_t	sa_handler;
#endif
#ifdef __ARCH_HAS_SA_RESTORER
	compat_uptr_t	sa_restorer;
#endif
	compat_sigset_t	sa_mask __packed;
};

typedef union compat_sigval {
	compat_int_t	sival_int;
	compat_uptr_t	sival_ptr;
} compat_sigval_t;

typedef struct compat_siginfo {
	int si_signo;
#ifndef __ARCH_HAS_SWAPPED_SIGINFO
	int si_errno;
	int si_code;
#else
	int si_code;
	int si_errno;
#endif

	union {
		int _pad[128/sizeof(int) - 3];

		/* kill() */
		struct {
			compat_pid_t _pid;	/* sender's pid */
			__compat_uid32_t _uid;	/* sender's uid */
		} _kill;

		/* POSIX.1b timers */
		struct {
			compat_timer_t _tid;	/* timer id */
			int _overrun;		/* overrun count */
			compat_sigval_t _sigval;	/* same as below */
		} _timer;

		/* POSIX.1b signals */
		struct {
			compat_pid_t _pid;	/* sender's pid */
			__compat_uid32_t _uid;	/* sender's uid */
			compat_sigval_t _sigval;
		} _rt;

		/* SIGCHLD */
		struct {
			compat_pid_t _pid;	/* which child */
			__compat_uid32_t _uid;	/* sender's uid */
			int _status;		/* exit code */
			compat_clock_t _utime;
			compat_clock_t _stime;
		} _sigchld;

#ifdef CONFIG_X86_X32_ABI
		/* SIGCHLD (x32 version) */
		struct {
			compat_pid_t _pid;	/* which child */
			__compat_uid32_t _uid;	/* sender's uid */
			int _status;		/* exit code */
			compat_s64 _utime;
			compat_s64 _stime;
		} _sigchld_x32;
#endif

		/* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */
		struct {
			compat_uptr_t _addr;	/* faulting insn/memory ref. */
#define __COMPAT_ADDR_BND_PKEY_PAD  (__alignof__(compat_uptr_t) < sizeof(short) ? \
				     sizeof(short) : __alignof__(compat_uptr_t))
			union {
				/* used on alpha and sparc */
				int _trapno;	/* TRAP # which caused the signal */
				/*
				 * used when si_code=BUS_MCEERR_AR or
				 * used when si_code=BUS_MCEERR_AO
				 */
				short int _addr_lsb;	/* Valid LSB of the reported address. */
				/* used when si_code=SEGV_BNDERR */
				struct {
					char _dummy_bnd[__COMPAT_ADDR_BND_PKEY_PAD];
					compat_uptr_t _lower;
					compat_uptr_t _upper;
				} _addr_bnd;
				/* used when si_code=SEGV_PKUERR */
				struct {
					char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD];
					u32 _pkey;
				} _addr_pkey;
				/* used when si_code=TRAP_PERF */
				struct {
					compat_ulong_t _data;
					u32 _type;
					u32 _flags;
				} _perf;
			};
		} _sigfault;

		/* SIGPOLL */
		struct {
			compat_long_t _band;	/* POLL_IN, POLL_OUT, POLL_MSG */
			int _fd;
		} _sigpoll;

		struct {
			compat_uptr_t _call_addr; /* calling user insn */
			int _syscall;		/* triggering system call number */
			unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
		} _sigsys;
	} _sifields;
} compat_siginfo_t;

struct compat_rlimit {
	compat_ulong_t	rlim_cur;
	compat_ulong_t	rlim_max;
};

#ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED
#define __ARCH_COMPAT_FLOCK64_PACK	__attribute__((packed))
#else
#define __ARCH_COMPAT_FLOCK64_PACK
#endif

struct compat_flock {
	short		l_type;
	short		l_whence;
	compat_off_t	l_start;
	compat_off_t	l_len;
#ifdef __ARCH_COMPAT_FLOCK_EXTRA_SYSID
	__ARCH_COMPAT_FLOCK_EXTRA_SYSID
#endif
	compat_pid_t	l_pid;
#ifdef __ARCH_COMPAT_FLOCK_PAD
	__ARCH_COMPAT_FLOCK_PAD
#endif
};

struct compat_flock64 {
	short		l_type;
	short		l_whence;
	compat_loff_t	l_start;
	compat_loff_t	l_len;
	compat_pid_t	l_pid;
#ifdef __ARCH_COMPAT_FLOCK64_PAD
	__ARCH_COMPAT_FLOCK64_PAD
#endif
} __ARCH_COMPAT_FLOCK64_PACK;

struct compat_rusage {
	struct old_timeval32 ru_utime;
	struct old_timeval32 ru_stime;
	compat_long_t	ru_maxrss;
	compat_long_t	ru_ixrss;
	compat_long_t	ru_idrss;
	compat_long_t	ru_isrss;
	compat_long_t	ru_minflt;
	compat_long_t	ru_majflt;
	compat_long_t	ru_nswap;
	compat_long_t	ru_inblock;
	compat_long_t	ru_oublock;
	compat_long_t	ru_msgsnd;
	compat_long_t	ru_msgrcv;
	compat_long_t	ru_nsignals;
	compat_long_t	ru_nvcsw;
	compat_long_t	ru_nivcsw;
};

extern int put_compat_rusage(const struct rusage *,
			     struct compat_rusage __user *);

struct compat_siginfo;
struct __compat_aio_sigset;

struct compat_dirent {
	u32		d_ino;
	compat_off_t	d_off;
	u16		d_reclen;
	char		d_name[256];
};

struct compat_ustat {
	compat_daddr_t	f_tfree;
	compat_ino_t	f_tinode;
	char		f_fname[6];
	char		f_fpack[6];
};

#define COMPAT_SIGEV_PAD_SIZE	((SIGEV_MAX_SIZE/sizeof(int)) - 3)

typedef struct compat_sigevent {
	compat_sigval_t sigev_value;
	compat_int_t sigev_signo;
	compat_int_t sigev_notify;
	union {
		compat_int_t _pad[COMPAT_SIGEV_PAD_SIZE];
		compat_int_t _tid;

		struct {
			compat_uptr_t _function;
			compat_uptr_t _attribute;
		} _sigev_thread;
	} _sigev_un;
} compat_sigevent_t;

struct compat_ifmap {
	compat_ulong_t mem_start;
	compat_ulong_t mem_end;
	unsigned short base_addr;
	unsigned char irq;
	unsigned char dma;
	unsigned char port;
};

struct compat_if_settings {
	unsigned int type;	/* Type of physical device or protocol */
	unsigned int size;	/* Size of the data allocated by the caller */
	compat_uptr_t ifs_ifsu;	/* union of pointers */
};

struct compat_ifreq {
	union {
		char	ifrn_name[IFNAMSIZ];	/* if name, e.g. "en0" */
	} ifr_ifrn;
	union {
		struct	sockaddr ifru_addr;
		struct	sockaddr ifru_dstaddr;
		struct	sockaddr ifru_broadaddr;
		struct	sockaddr ifru_netmask;
		struct	sockaddr ifru_hwaddr;
		short	ifru_flags;
		compat_int_t	ifru_ivalue;
		compat_int_t	ifru_mtu;
		struct	compat_ifmap ifru_map;
		char	ifru_slave[IFNAMSIZ];	/* Just fits the size */
		char	ifru_newname[IFNAMSIZ];
		compat_caddr_t	ifru_data;
		struct	compat_if_settings ifru_settings;
	} ifr_ifru;
};

struct compat_ifconf {
	compat_int_t	ifc_len;	/* size of buffer */
	compat_caddr_t	ifcbuf;
};

struct compat_robust_list {
	compat_uptr_t	next;
};

struct compat_robust_list_head {
	struct compat_robust_list	list;
	compat_long_t			futex_offset;
	compat_uptr_t			list_op_pending;
};

#ifdef CONFIG_COMPAT_OLD_SIGACTION
struct compat_old_sigaction {
	compat_uptr_t		sa_handler;
	compat_old_sigset_t	sa_mask;
	compat_ulong_t		sa_flags;
	compat_uptr_t		sa_restorer;
};
#endif

struct compat_keyctl_kdf_params {
	compat_uptr_t hashname;
	compat_uptr_t otherinfo;
	__u32 otherinfolen;
	__u32 __spare[8];
};

struct compat_stat;
struct compat_statfs;
struct compat_statfs64;
struct compat_old_linux_dirent;
struct compat_linux_dirent;
struct linux_dirent64;
struct compat_msghdr;
struct compat_mmsghdr;
struct compat_sysinfo;
struct compat_sysctl_args;
struct compat_kexec_segment;
struct compat_mq_attr;
struct compat_msgbuf;

void copy_siginfo_to_external32(struct compat_siginfo *to,
				const struct kernel_siginfo *from);
int copy_siginfo_from_user32(kernel_siginfo_t *to,
			     const struct compat_siginfo __user *from);
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
			     const kernel_siginfo_t *from);
#ifndef copy_siginfo_to_user32
#define copy_siginfo_to_user32 __copy_siginfo_to_user32
#endif
int get_compat_sigevent(struct sigevent *event,
			const struct compat_sigevent __user *u_event);

extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);

/*
 * Defined inline such that size can be compile time constant, which avoids
 * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct
 */
static inline int
put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
		  unsigned int size)
{
	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
#if defined(__BIG_ENDIAN) && defined(CONFIG_64BIT)
	compat_sigset_t v;
	switch (_NSIG_WORDS) {
	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
		fallthrough;
	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
		fallthrough;
	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
		fallthrough;
	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
	}
	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
#else
	return copy_to_user(compat, set, size) ? -EFAULT : 0;
#endif
}

#ifdef CONFIG_CPU_BIG_ENDIAN
#define unsafe_put_compat_sigset(compat, set, label) do {		\
	compat_sigset_t __user *__c = compat;				\
	const sigset_t *__s = set;					\
									\
	switch (_NSIG_WORDS) {						\
	case 4:								\
		unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label);\
		unsafe_put_user(__s->sig[3], &__c->sig[6], label);	\
		fallthrough;						\
	case 3:								\
		unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label);\
		unsafe_put_user(__s->sig[2], &__c->sig[4], label);	\
		fallthrough;						\
	case 2:								\
		unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label);\
		unsafe_put_user(__s->sig[1], &__c->sig[2], label);	\
		fallthrough;						\
	case 1:								\
		unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label);\
		unsafe_put_user(__s->sig[0], &__c->sig[0], label);	\
	}								\
} while (0)

#define unsafe_get_compat_sigset(set, compat, label) do {		\
	const compat_sigset_t __user *__c = compat;			\
	compat_sigset_word hi, lo;					\
	sigset_t *__s = set;						\
									\
	switch (_NSIG_WORDS) {						\
	case 4:								\
		unsafe_get_user(lo, &__c->sig[7], label);		\
		unsafe_get_user(hi, &__c->sig[6], label);		\
		__s->sig[3] = hi | (((long)lo) << 32);			\
		fallthrough;						\
	case 3:								\
		unsafe_get_user(lo, &__c->sig[5], label);		\
		unsafe_get_user(hi, &__c->sig[4], label);		\
		__s->sig[2] = hi | (((long)lo) << 32);			\
		fallthrough;						\
	case 2:								\
		unsafe_get_user(lo, &__c->sig[3], label);		\
		unsafe_get_user(hi, &__c->sig[2], label);		\
		__s->sig[1] = hi | (((long)lo) << 32);			\
		fallthrough;						\
	case 1:								\
		unsafe_get_user(lo, &__c->sig[1], label);		\
		unsafe_get_user(hi, &__c->sig[0], label);		\
		__s->sig[0] = hi | (((long)lo) << 32);			\
	}								\
} while (0)
#else
#define unsafe_put_compat_sigset(compat, set, label) do {		\
	compat_sigset_t __user *__c = compat;				\
	const sigset_t *__s = set;					\
									\
	unsafe_copy_to_user(__c, __s, sizeof(*__c), label);		\
} while (0)

#define unsafe_get_compat_sigset(set, compat, label) do {		\
	const compat_sigset_t __user *__c = compat;			\
	sigset_t *__s = set;						\
									\
	unsafe_copy_from_user(__s, __c, sizeof(*__c), label);		\
} while (0)
#endif

extern int compat_ptrace_request(struct task_struct *child,
				 compat_long_t request,
				 compat_ulong_t addr, compat_ulong_t data);

extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
			       compat_ulong_t addr, compat_ulong_t data);

struct epoll_event;	/* fortunately, this one is fixed-layout */

int compat_restore_altstack(const compat_stack_t __user *uss);
int __compat_save_altstack(compat_stack_t __user *, unsigned long);
#define unsafe_compat_save_altstack(uss, sp, label) do { \
	compat_stack_t __user *__uss = uss; \
	struct task_struct *t = current; \
	unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \
			&__uss->ss_sp, label); \
	unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \
	unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \
} while (0);

/*
 * These syscall function prototypes are kept in the same order as
 * include/uapi/asm-generic/unistd.h. Deprecated or obsolete system calls
 * go below.
 *
 * Please note that these prototypes here are only provided for information
 * purposes, for static analysis, and for linking from the syscall table.
 * These functions should not be called elsewhere from kernel code.
 *
 * As the syscall calling convention may be different from the default
 * for architectures overriding the syscall calling convention, do not
 * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled.
 */
#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p);
asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr,
				     u32 __user *iocb);
asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id,
			compat_long_t min_nr, compat_long_t nr,
			struct io_event __user *events,
			struct old_timespec32 __user *timeout,
			const struct __compat_aio_sigset __user *usig);
asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id,
			compat_long_t min_nr, compat_long_t nr,
			struct io_event __user *events,
			struct __kernel_timespec __user *timeout,
			const struct __compat_aio_sigset __user *usig);
asmlinkage long compat_sys_epoll_pwait(int epfd,
			struct epoll_event __user *events,
			int maxevents, int timeout,
			const compat_sigset_t __user *sigmask,
			compat_size_t sigsetsize);
asmlinkage long compat_sys_epoll_pwait2(int epfd,
			struct epoll_event __user *events,
			int maxevents,
			const struct __kernel_timespec __user *timeout,
			const compat_sigset_t __user *sigmask,
			compat_size_t sigsetsize);
asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
				 compat_ulong_t arg);
asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
				   compat_ulong_t arg);
asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
				 compat_ulong_t arg);
asmlinkage long compat_sys_statfs(const char __user *pathname,
				  struct compat_statfs __user *buf);
asmlinkage long compat_sys_statfs64(const char __user *pathname,
				    compat_size_t sz,
				    struct compat_statfs64 __user *buf);
asmlinkage long compat_sys_fstatfs(unsigned int fd,
				   struct compat_statfs __user *buf);
asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
				     struct compat_statfs64 __user *buf);
asmlinkage long compat_sys_truncate(const char __user *, compat_off_t);
asmlinkage long compat_sys_ftruncate(unsigned int, compat_ulong_t);
/* No generic prototype for truncate64, ftruncate64, fallocate */
asmlinkage long compat_sys_openat(int dfd, const char __user *filename,
				  int flags, umode_t mode);
asmlinkage long compat_sys_getdents(unsigned int fd,
				    struct compat_linux_dirent __user *dirent,
				    unsigned int count);
asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);
/* No generic prototype for pread64 and pwrite64 */
asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd,
		const struct iovec __user *vec,
		compat_ulong_t vlen, u32 pos_low, u32 pos_high);
asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd,
		const struct iovec __user *vec,
		compat_ulong_t vlen, u32 pos_low, u32 pos_high);
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
asmlinkage long compat_sys_preadv64(unsigned long fd,
		const struct iovec __user *vec,
		unsigned long vlen, loff_t pos);
#endif
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
asmlinkage long compat_sys_pwritev64(unsigned long fd,
		const struct iovec __user *vec,
		unsigned long vlen, loff_t pos);
#endif
asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
				    compat_off_t __user *offset, compat_size_t count);
asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
				    compat_loff_t __user *offset, compat_size_t count);
asmlinkage long compat_sys_pselect6_time32(int n, compat_ulong_t __user *inp,
				    compat_ulong_t __user *outp,
				    compat_ulong_t __user *exp,
				    struct old_timespec32 __user *tsp,
				    void __user *sig);
asmlinkage long compat_sys_pselect6_time64(int n, compat_ulong_t __user *inp,
				    compat_ulong_t __user *outp,
				    compat_ulong_t __user *exp,
				    struct __kernel_timespec __user *tsp,
				    void __user *sig);
asmlinkage long compat_sys_ppoll_time32(struct pollfd __user *ufds,
				 unsigned int nfds,
				 struct old_timespec32 __user *tsp,
				 const compat_sigset_t __user *sigmask,
				 compat_size_t sigsetsize);
asmlinkage long compat_sys_ppoll_time64(struct pollfd __user *ufds,
				 unsigned int nfds,
				 struct __kernel_timespec __user *tsp,
				 const compat_sigset_t __user *sigmask,
				 compat_size_t sigsetsize);
asmlinkage long compat_sys_signalfd4(int ufd,
				     const compat_sigset_t __user *sigmask,
				     compat_size_t sigsetsize, int flags);
asmlinkage long compat_sys_newfstatat(unsigned int dfd,
				      const char __user *filename,
				      struct compat_stat __user *statbuf,
				      int flag);
asmlinkage long compat_sys_newfstat(unsigned int fd,
				    struct compat_stat __user *statbuf);
/* No generic prototype for sync_file_range and sync_file_range2 */
asmlinkage long compat_sys_waitid(int, compat_pid_t,
		struct compat_siginfo __user *, int,
		struct compat_rusage __user *);
asmlinkage long
compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
			   compat_size_t len);
asmlinkage long
compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
			   compat_size_t __user *len_ptr);
asmlinkage long compat_sys_getitimer(int which,
				     struct old_itimerval32 __user *it);
asmlinkage long compat_sys_setitimer(int which,
				     struct old_itimerval32 __user *in,
				     struct old_itimerval32 __user *out);
asmlinkage long compat_sys_kexec_load(compat_ulong_t entry,
				      compat_ulong_t nr_segments,
				      struct compat_kexec_segment __user *,
				      compat_ulong_t flags);
asmlinkage long compat_sys_timer_create(clockid_t which_clock,
			struct compat_sigevent __user *timer_event_spec,
			timer_t __user *created_timer_id);
asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
				  compat_long_t addr, compat_long_t data);
asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
				     unsigned int len,
				     compat_ulong_t __user *user_mask_ptr);
asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid,
				     unsigned int len,
				     compat_ulong_t __user *user_mask_ptr);
asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr,
				       compat_stack_t __user *uoss_ptr);
asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset,
					 compat_size_t sigsetsize);
#ifndef CONFIG_ODD_RT_SIGACTION
asmlinkage long compat_sys_rt_sigaction(int,
				 const struct compat_sigaction __user *,
				 struct compat_sigaction __user *,
				 compat_size_t);
#endif
asmlinkage long compat_sys_rt_sigprocmask(int how, compat_sigset_t __user *set,
					  compat_sigset_t __user *oset,
					  compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset,
					 compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigtimedwait_time32(compat_sigset_t __user *uthese,
		struct compat_siginfo __user *uinfo,
		struct old_timespec32 __user *uts, compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese,
		struct compat_siginfo __user *uinfo,
		struct __kernel_timespec __user *uts, compat_size_t sigsetsize);
asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig,
				struct compat_siginfo __user *uinfo);
/* No generic prototype for rt_sigreturn */
asmlinkage long compat_sys_times(struct compat_tms __user *tbuf);
asmlinkage long compat_sys_getrlimit(unsigned int resource,
				     struct compat_rlimit __user *rlim);
asmlinkage long compat_sys_setrlimit(unsigned int resource,
				     struct compat_rlimit __user *rlim);
asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru);
asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv,
		struct timezone __user *tz);
asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv,
		struct timezone __user *tz);
asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info);
asmlinkage long compat_sys_mq_open(const char __user *u_name,
			int oflag, compat_mode_t mode,
			struct compat_mq_attr __user *u_attr);
asmlinkage long compat_sys_mq_notify(mqd_t mqdes,
			const struct compat_sigevent __user *u_notification);
asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes,
			const struct compat_mq_attr __user *u_mqstat,
			struct compat_mq_attr __user *u_omqstat);
asmlinkage long compat_sys_msgctl(int first, int second, void __user *uptr);
asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp,
		compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg);
asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp,
		compat_ssize_t msgsz, int msgflg);
asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg);
asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr);
asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg);
asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len,
			    unsigned flags, struct sockaddr __user *addr,
			    int __user *addrlen);
asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg,
				   unsigned flags);
asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg,
				   unsigned int flags);
/* No generic prototype for readahead */
asmlinkage long compat_sys_keyctl(u32 option,
			      u32 arg2, u32 arg3, u32 arg4, u32 arg5);
asmlinkage long compat_sys_execve(const char __user *filename,
		     const compat_uptr_t __user *argv,
		     const compat_uptr_t __user *envp);
/* No generic prototype for fadvise64_64 */
/* CONFIG_MMU only */
asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid,
					compat_pid_t pid, int sig,
					struct compat_siginfo __user *uinfo);
asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg,
					   unsigned vlen, unsigned int flags,
					   struct __kernel_timespec __user *timeout);
asmlinkage long compat_sys_recvmmsg_time32(int fd, struct compat_mmsghdr __user *mmsg,
					   unsigned vlen, unsigned int flags,
					   struct old_timespec32 __user *timeout);
asmlinkage long compat_sys_wait4(compat_pid_t pid,
				 compat_uint_t __user *stat_addr, int options,
				 struct compat_rusage __user *ru);
asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
					    int, const char __user *);
asmlinkage long compat_sys_open_by_handle_at(int mountdirfd,
					     struct file_handle __user *handle,
					     int flags);
asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg,
				    unsigned vlen, unsigned int flags);
asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
		     const compat_uptr_t __user *argv,
		     const compat_uptr_t __user *envp, int flags);
asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd,
		const struct iovec __user *vec,
		compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd,
		const struct iovec __user *vec,
		compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags);
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
asmlinkage long compat_sys_preadv64v2(unsigned long fd,
		const struct iovec __user *vec,
		unsigned long vlen, loff_t pos, rwf_t flags);
#endif
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
asmlinkage long compat_sys_pwritev64v2(unsigned long fd,
		const struct iovec __user *vec,
		unsigned long vlen, loff_t pos, rwf_t flags);
#endif

/*
 * Deprecated system calls which are still defined in
 * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch
 */

/* __ARCH_WANT_SYSCALL_NO_AT */
asmlinkage long compat_sys_open(const char __user *filename, int flags,
				umode_t mode);

/* __ARCH_WANT_SYSCALL_NO_FLAGS */
asmlinkage long compat_sys_signalfd(int ufd,
				    const compat_sigset_t __user *sigmask,
				    compat_size_t sigsetsize);

/* __ARCH_WANT_SYSCALL_OFF_T */
asmlinkage long compat_sys_newstat(const char __user *filename,
				   struct compat_stat __user *statbuf);
asmlinkage long compat_sys_newlstat(const char __user *filename,
				    struct compat_stat __user *statbuf);

/* __ARCH_WANT_SYSCALL_DEPRECATED */
asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
		struct old_timeval32 __user *tvp);
asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32);
asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len,
				unsigned flags);

/* obsolete */
asmlinkage long compat_sys_old_readdir(unsigned int fd,
				       struct compat_old_linux_dirent __user *,
				       unsigned int count);

/* obsolete */
asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg);

/* obsolete */
asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32);

/* obsolete */
#ifdef __ARCH_WANT_SYS_SIGPENDING
asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set);
#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *nset,
				       compat_old_sigset_t __user *oset);
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
asmlinkage long compat_sys_sigaction(int sig,
				     const struct compat_old_sigaction __user *act,
				     struct compat_old_sigaction __user *oact);
#endif

/* obsolete */
asmlinkage long compat_sys_socketcall(int call, u32 __user *args);

#ifdef __ARCH_WANT_COMPAT_TRUNCATE64
asmlinkage long compat_sys_truncate64(const char __user *pathname, compat_arg_u64(len));
#endif

#ifdef __ARCH_WANT_COMPAT_FTRUNCATE64
asmlinkage long compat_sys_ftruncate64(unsigned int fd, compat_arg_u64(len));
#endif

#ifdef __ARCH_WANT_COMPAT_FALLOCATE
asmlinkage long compat_sys_fallocate(int fd, int mode, compat_arg_u64(offset),
				     compat_arg_u64(len));
#endif

#ifdef __ARCH_WANT_COMPAT_PREAD64
asmlinkage long compat_sys_pread64(unsigned int fd, char __user *buf, size_t count,
				   compat_arg_u64(pos));
#endif

#ifdef __ARCH_WANT_COMPAT_PWRITE64
asmlinkage long compat_sys_pwrite64(unsigned int fd, const char __user *buf, size_t count,
				    compat_arg_u64(pos));
#endif

#ifdef __ARCH_WANT_COMPAT_SYNC_FILE_RANGE
asmlinkage long compat_sys_sync_file_range(int fd, compat_arg_u64(pos),
					   compat_arg_u64(nbytes), unsigned int flags);
#endif

#ifdef __ARCH_WANT_COMPAT_FADVISE64_64
asmlinkage long compat_sys_fadvise64_64(int fd, compat_arg_u64(pos),
					compat_arg_u64(len), int advice);
#endif

#ifdef __ARCH_WANT_COMPAT_READAHEAD
asmlinkage long compat_sys_readahead(int fd, compat_arg_u64(offset), size_t count);
#endif

#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */

/**
 * ns_to_old_timeval32 - Compat version of ns_to_timeval
 * @nsec: the nanoseconds value to be converted
 *
 * Returns the old_timeval32 representation of the nsec parameter.
 */
static inline struct old_timeval32 ns_to_old_timeval32(s64 nsec)
{
	struct __kernel_old_timeval tv;
	struct old_timeval32 ctv;

	tv = ns_to_kernel_old_timeval(nsec);
	ctv.tv_sec = tv.tv_sec;
	ctv.tv_usec = tv.tv_usec;

	return ctv;
}

/*
 * Kernel code should not call compat syscalls (i.e., compat_sys_xyzyyz())
 * directly.  Instead, use one of the functions which work equivalently, such
 * as the kcompat_sys_xyzyyz() functions prototyped below.
 */

int kcompat_sys_statfs64(const char __user *pathname, compat_size_t sz,
			 struct compat_statfs64 __user *buf);
int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
			  struct compat_statfs64 __user *buf);

#ifdef CONFIG_COMPAT

/*
 * For most but not all architectures, "am I in a compat syscall?" and
 * "am I a compat task?" are the same question.  For architectures on which
 * they aren't the same question, arch code can override in_compat_syscall.
 */
#ifndef in_compat_syscall
static inline bool in_compat_syscall(void) { return is_compat_task(); }
#endif

#else /* !CONFIG_COMPAT */

#define is_compat_task() (0)
/* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */
#define in_compat_syscall in_compat_syscall
static inline bool in_compat_syscall(void) { return false; }

#endif /* CONFIG_COMPAT */

#define BITS_PER_COMPAT_LONG	(8*sizeof(compat_long_t))

#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG)

long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
		       unsigned long bitmap_size);
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
		       unsigned long bitmap_size);

/*
 * Some legacy ABIs like the i386 one use less than natural alignment for
 * 64-bit types, and will need special compat treatment for that.  Most
 * architectures don't need that special handling even for compat syscalls.
 */
#ifndef compat_need_64bit_alignment_fixup
#define compat_need_64bit_alignment_fixup()	false
#endif

/*
 * A pointer passed in from user mode. This should not
 * be used for syscall parameters, just declare them
 * as pointers because the syscall entry code will have
 * appropriately converted them already.
 */
#ifndef compat_ptr
static inline void __user *compat_ptr(compat_uptr_t uptr)
{
	return (void __user *)(unsigned long)uptr;
}
#endif

static inline compat_uptr_t ptr_to_compat(void __user *uptr)
{
	return (u32)(unsigned long)uptr;
}

#endif /* _LINUX_COMPAT_H */
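Tying the pieces together: a compat entry point defined with the macros above typically widens 32-bit user pointers with compat_ptr() before calling a native helper. A minimal sketch; the syscall name "frobnicate" and the helper do_frobnicate() are invented for illustration, only the macro and compat_ptr() come from this header:

/* Hypothetical compat syscall: widen the 32-bit pointer and size,
 * then hand off to a (made-up) native implementation.
 */
COMPAT_SYSCALL_DEFINE2(frobnicate, compat_uptr_t, ubuf, compat_size_t, len)
{
	return do_frobnicate(compat_ptr(ubuf), (size_t)len);
}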
// SPDX-License-Identifier: GPL-2.0
/*
 * Filesystem-level keyring for fscrypt
 *
 * Copyright 2019 Google LLC
 */

/*
 * This file implements management of fscrypt master keys in the
 * filesystem-level keyring, including the ioctls:
 *
 * - FS_IOC_ADD_ENCRYPTION_KEY
 * - FS_IOC_REMOVE_ENCRYPTION_KEY
 * - FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS
 * - FS_IOC_GET_ENCRYPTION_KEY_STATUS
 *
 * See the "User API" section of Documentation/filesystems/fscrypt.rst for more
 * information about these ioctls.
 */

#include <asm/unaligned.h>
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
#include <linux/seq_file.h>

#include "fscrypt_private.h"

/* The master encryption keys for a filesystem (->s_master_keys) */
struct fscrypt_keyring {
	/*
	 * Lock that protects ->key_hashtable.  It does *not* protect the
	 * fscrypt_master_key structs themselves.
	 */
	spinlock_t lock;

	/* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */
	struct hlist_head key_hashtable[128];
};

static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
	fscrypt_destroy_hkdf(&secret->hkdf);
	memzero_explicit(secret, sizeof(*secret));
}

static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
				   struct fscrypt_master_key_secret *src)
{
	memcpy(dst, src, sizeof(*dst));
	memzero_explicit(src, sizeof(*src));
}

static void fscrypt_free_master_key(struct rcu_head *head)
{
	struct fscrypt_master_key *mk =
		container_of(head, struct fscrypt_master_key, mk_rcu_head);
	/*
	 * The master key secret and any embedded subkeys should have already
	 * been wiped when the last active reference to the fscrypt_master_key
	 * struct was dropped; doing it here would be unnecessarily late.
	 * Nevertheless, use kfree_sensitive() in case anything was missed.
	 */
	kfree_sensitive(mk);
}

void fscrypt_put_master_key(struct fscrypt_master_key *mk)
{
	if (!refcount_dec_and_test(&mk->mk_struct_refs))
		return;
	/*
	 * No structural references left, so free ->mk_users, and also free the
	 * fscrypt_master_key struct itself after an RCU grace period ensures
	 * that concurrent keyring lookups can no longer find it.
	 */
	WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0);
	key_put(mk->mk_users);
	mk->mk_users = NULL;
	call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}

void fscrypt_put_master_key_activeref(struct super_block *sb,
				      struct fscrypt_master_key *mk)
{
	size_t i;

	if (!refcount_dec_and_test(&mk->mk_active_refs))
		return;
	/*
	 * No active references left, so complete the full removal of this
	 * fscrypt_master_key struct by removing it from the keyring and
	 * destroying any subkeys embedded in it.
	 */
	if (WARN_ON_ONCE(!sb->s_master_keys))
		return;
	spin_lock(&sb->s_master_keys->lock);
	hlist_del_rcu(&mk->mk_node);
	spin_unlock(&sb->s_master_keys->lock);

	/*
	 * ->mk_active_refs == 0 implies that ->mk_present is false and
	 * ->mk_decrypted_inodes is empty.
	 */
	WARN_ON_ONCE(mk->mk_present);
	WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes));

	for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
		fscrypt_destroy_prepared_key(sb, &mk->mk_direct_keys[i]);
		fscrypt_destroy_prepared_key(sb, &mk->mk_iv_ino_lblk_64_keys[i]);
		fscrypt_destroy_prepared_key(sb, &mk->mk_iv_ino_lblk_32_keys[i]);
	}
	memzero_explicit(&mk->mk_ino_hash_key, sizeof(mk->mk_ino_hash_key));
	mk->mk_ino_hash_key_initialized = false;

	/* Drop the structural ref associated with the active refs. */
	fscrypt_put_master_key(mk);
}

/*
 * This transitions the key state from present to incompletely removed, and
 * then potentially to absent (depending on whether inodes remain).
 */
static void fscrypt_initiate_key_removal(struct super_block *sb,
					 struct fscrypt_master_key *mk)
{
	WRITE_ONCE(mk->mk_present, false);
	wipe_master_key_secret(&mk->mk_secret);
	fscrypt_put_master_key_activeref(sb, mk);
}

static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
{
	if (spec->__reserved)
		return false;
	return master_key_spec_len(spec) != 0;
}

static int fscrypt_user_key_instantiate(struct key *key,
					struct key_preparsed_payload *prep)
{
	/*
	 * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for
	 * each key, regardless of the exact key size.  The amount of memory
	 * actually used is greater than the size of the raw key anyway.
	 */
	return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE);
}

static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m)
{
	seq_puts(m, key->description);
}

/*
 * Type of key in ->mk_users.  Each key of this type represents a particular
 * user who has added a particular master key.
 *
 * Note that the name of this key type really should be something like
 * ".fscrypt-user" instead of simply ".fscrypt".  But the shorter name is
 * chosen mainly for simplicity of presentation in /proc/keys when read by a
 * non-root user.  And it is expected to be rare that a key is actually added
 * by multiple users, since users should keep their encryption keys
 * confidential.
 */
static struct key_type key_type_fscrypt_user = {
	.name			= ".fscrypt",
	.instantiate		= fscrypt_user_key_instantiate,
	.describe		= fscrypt_user_key_describe,
};

#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE	\
	(CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
	 CONST_STRLEN("-users") + 1)

#define FSCRYPT_MK_USER_DESCRIPTION_SIZE	\
	(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)

static void format_mk_users_keyring_description(
			char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
			const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	sprintf(description, "fscrypt-%*phN-users",
		FSCRYPT_KEY_IDENTIFIER_SIZE, mk_identifier);
}

static void format_mk_user_description(
			char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE],
			const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	sprintf(description, "%*phN.uid.%u", FSCRYPT_KEY_IDENTIFIER_SIZE,
		mk_identifier, __kuid_val(current_fsuid()));
}

/* Create ->s_master_keys if needed.  Synchronized by fscrypt_add_key_mutex. */
static int allocate_filesystem_keyring(struct super_block *sb)
{
	struct fscrypt_keyring *keyring;

	if (sb->s_master_keys)
		return 0;

	keyring = kzalloc(sizeof(*keyring), GFP_KERNEL);
	if (!keyring)
		return -ENOMEM;
	spin_lock_init(&keyring->lock);
	/*
	 * Pairs with the smp_load_acquire() in fscrypt_find_master_key().
	 * I.e., here we publish ->s_master_keys with a RELEASE barrier so that
	 * concurrent tasks can ACQUIRE it.
	 */
	smp_store_release(&sb->s_master_keys, keyring);
	return 0;
}

/*
 * Release all encryption keys that have been added to the filesystem, along
 * with the keyring that contains them.
 *
 * This is called at unmount time, after all potentially-encrypted inodes have
 * been evicted.  The filesystem's underlying block device(s) are still
 * available at this time; this is important because after user file accesses
 * have been allowed, this function may need to evict keys from the keyslots of
 * an inline crypto engine, which requires the block device(s).
 */
void fscrypt_destroy_keyring(struct super_block *sb)
{
	struct fscrypt_keyring *keyring = sb->s_master_keys;
	size_t i;

	if (!keyring)
		return;

	for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) {
		struct hlist_head *bucket = &keyring->key_hashtable[i];
		struct fscrypt_master_key *mk;
		struct hlist_node *tmp;

		hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) {
			/*
			 * Since all potentially-encrypted inodes were already
			 * evicted, every key remaining in the keyring should
			 * have an empty inode list, and should only still be in
			 * the keyring due to the single active ref associated
			 * with ->mk_present.  There should be no structural
			 * refs beyond the one associated with the active ref.
			 */
			WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1);
			WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1);
			WARN_ON_ONCE(!mk->mk_present);
			fscrypt_initiate_key_removal(sb, mk);
		}
	}
	kfree_sensitive(keyring);
	sb->s_master_keys = NULL;
}

static struct hlist_head *
fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring,
		       const struct fscrypt_key_specifier *mk_spec)
{
	/*
	 * Since key specifiers should be "random" values, it is sufficient to
	 * use a trivial hash function that just takes the first several bits
	 * of the key specifier.
	 */
	unsigned long i = get_unaligned((unsigned long *)&mk_spec->u);

	return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)];
}

/*
 * Find the specified master key struct in ->s_master_keys and take a
 * structural ref to it.  The structural ref guarantees that the key struct
 * continues to exist, but it does *not* guarantee that ->s_master_keys
 * continues to contain the key struct.  The structural ref needs to be
 * dropped by fscrypt_put_master_key().  Returns NULL if the key struct is
 * not found.
 */
struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
			const struct fscrypt_key_specifier *mk_spec)
{
	struct fscrypt_keyring *keyring;
	struct hlist_head *bucket;
	struct fscrypt_master_key *mk;

	/*
	 * Pairs with the smp_store_release() in allocate_filesystem_keyring().
	 * I.e., another task can publish ->s_master_keys concurrently,
	 * executing a RELEASE barrier.  We need to use smp_load_acquire() here
	 * to safely ACQUIRE the memory the other task published.
	 */
	keyring = smp_load_acquire(&sb->s_master_keys);
	if (keyring == NULL)
		return NULL; /* No keyring yet, so no keys yet. */

	bucket = fscrypt_mk_hash_bucket(keyring, mk_spec);
	rcu_read_lock();
	switch (mk_spec->type) {
	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
		hlist_for_each_entry_rcu(mk, bucket, mk_node) {
			if (mk->mk_spec.type ==
				FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
			    memcmp(mk->mk_spec.u.descriptor,
				   mk_spec->u.descriptor,
				   FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
			    refcount_inc_not_zero(&mk->mk_struct_refs))
				goto out;
		}
		break;
	case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
		hlist_for_each_entry_rcu(mk, bucket, mk_node) {
			if (mk->mk_spec.type ==
				FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
			    memcmp(mk->mk_spec.u.identifier,
				   mk_spec->u.identifier,
				   FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 &&
			    refcount_inc_not_zero(&mk->mk_struct_refs))
				goto out;
		}
		break;
	}
	mk = NULL;
out:
	rcu_read_unlock();
	return mk;
}

static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
{
	char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE];
	struct key *keyring;

	format_mk_users_keyring_description(description,
					    mk->mk_spec.u.identifier);
	keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
				current_cred(), KEY_POS_SEARCH |
				  KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
	if (IS_ERR(keyring))
		return PTR_ERR(keyring);

	mk->mk_users = keyring;
	return 0;
}

/*
 * Find the current user's "key" in the master key's ->mk_users.
 * Returns ERR_PTR(-ENOKEY) if not found.
 */
static struct key *find_master_key_user(struct fscrypt_master_key *mk)
{
	char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
	key_ref_t keyref;

	format_mk_user_description(description, mk->mk_spec.u.identifier);

	/*
	 * We need to mark the keyring reference as "possessed" so that we
	 * acquire permission to search it, via the KEY_POS_SEARCH permission.
	 */
	keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/),
				&key_type_fscrypt_user, description, false);
	if (IS_ERR(keyref)) {
		if (PTR_ERR(keyref) == -EAGAIN ||	/* not found */
		    PTR_ERR(keyref) == -EKEYREVOKED)	/* recently invalidated */
			keyref = ERR_PTR(-ENOKEY);
		return ERR_CAST(keyref);
	}
	return key_ref_to_ptr(keyref);
}

/*
 * Give the current user a "key" in ->mk_users.  This charges the user's quota
 * and marks the master key as added by the current user, so that it cannot be
 * removed by another user with the key.  Either ->mk_sem must be held for
 * write, or the master key must be still undergoing initialization.
*/ static int add_master_key_user(struct fscrypt_master_key *mk) { char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE]; struct key *mk_user; int err; format_mk_user_description(description, mk->mk_spec.u.identifier); mk_user = key_alloc(&key_type_fscrypt_user, description, current_fsuid(), current_gid(), current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW, 0, NULL); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_instantiate_and_link(mk_user, NULL, 0, mk->mk_users, NULL); key_put(mk_user); return err; } /* * Remove the current user's "key" from ->mk_users. * ->mk_sem must be held for write. * * Returns 0 if removed, -ENOKEY if not found, or another -errno code. */ static int remove_master_key_user(struct fscrypt_master_key *mk) { struct key *mk_user; int err; mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) return PTR_ERR(mk_user); err = key_unlink(mk->mk_users, mk_user); key_put(mk_user); return err; } /* * Allocate a new fscrypt_master_key, transfer the given secret over to it, and * insert it into sb->s_master_keys. */ static int add_new_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { struct fscrypt_keyring *keyring = sb->s_master_keys; struct fscrypt_master_key *mk; int err; mk = kzalloc(sizeof(*mk), GFP_KERNEL); if (!mk) return -ENOMEM; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; INIT_LIST_HEAD(&mk->mk_decrypted_inodes); spin_lock_init(&mk->mk_decrypted_inodes_lock); if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = allocate_master_key_users_keyring(mk); if (err) goto out_put; err = add_master_key_user(mk); if (err) goto out_put; } move_master_key_secret(&mk->mk_secret, secret); mk->mk_present = true; refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */ spin_lock(&keyring->lock); hlist_add_head_rcu(&mk->mk_node, fscrypt_mk_hash_bucket(keyring, mk_spec)); spin_unlock(&keyring->lock); return 0; out_put: fscrypt_put_master_key(mk); return err; } #define KEY_DEAD 1 static int add_existing_master_key(struct fscrypt_master_key *mk, struct fscrypt_master_key_secret *secret) { int err; /* * If the current user is already in ->mk_users, then there's nothing to * do. Otherwise, we need to add the user to ->mk_users. (Neither is * applicable for v1 policy keys, which have NULL ->mk_users.) */ if (mk->mk_users) { struct key *mk_user = find_master_key_user(mk); if (mk_user != ERR_PTR(-ENOKEY)) { if (IS_ERR(mk_user)) return PTR_ERR(mk_user); key_put(mk_user); return 0; } err = add_master_key_user(mk); if (err) return err; } /* If the key is incompletely removed, make it present again. */ if (!mk->mk_present) { if (!refcount_inc_not_zero(&mk->mk_active_refs)) { /* * Raced with the last active ref being dropped, so the * key has become, or is about to become, "absent". * Therefore, we need to allocate a new key struct. */ return KEY_DEAD; } move_master_key_secret(&mk->mk_secret, secret); WRITE_ONCE(mk->mk_present, true); } return 0; } static int do_add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct fscrypt_master_key *mk; int err; mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */ mk = fscrypt_find_master_key(sb, mk_spec); if (!mk) { /* Didn't find the key in ->s_master_keys. Add it. 
*/ err = allocate_filesystem_keyring(sb); if (!err) err = add_new_master_key(sb, secret, mk_spec); } else { /* * Found the key in ->s_master_keys. Add the user to ->mk_users * if needed, and make the key "present" again if possible. */ down_write(&mk->mk_sem); err = add_existing_master_key(mk, secret); up_write(&mk->mk_sem); if (err == KEY_DEAD) { /* * We found a key struct, but it's already been fully * removed. Ignore the old struct and add a new one. * fscrypt_add_key_mutex means we don't need to worry * about concurrent adds. */ err = add_new_master_key(sb, secret, mk_spec); } fscrypt_put_master_key(mk); } mutex_unlock(&fscrypt_add_key_mutex); return err; } static int add_master_key(struct super_block *sb, struct fscrypt_master_key_secret *secret, struct fscrypt_key_specifier *key_spec) { int err; if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, secret->size); if (err) return err; /* * Now that the HKDF context is initialized, the raw key is no * longer needed. */ memzero_explicit(secret->raw, secret->size); /* Calculate the key identifier */ err = fscrypt_hkdf_expand(&secret->hkdf, HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, key_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); if (err) return err; } return do_add_master_key(sb, secret, key_spec); } static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE || prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE) return -EINVAL; if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; if (payload->__reserved) return -EINVAL; prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); if (!prep->payload.data[0]) return -ENOMEM; prep->quotalen = prep->datalen; return 0; } static void fscrypt_provisioning_key_free_preparse( struct key_preparsed_payload *prep) { kfree_sensitive(prep->payload.data[0]); } static void fscrypt_provisioning_key_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); if (key_is_positive(key)) { const struct fscrypt_provisioning_key_payload *payload = key->payload.data[0]; seq_printf(m, ": %u [%u]", key->datalen, payload->type); } } static void fscrypt_provisioning_key_destroy(struct key *key) { kfree_sensitive(key->payload.data[0]); } static struct key_type key_type_fscrypt_provisioning = { .name = "fscrypt-provisioning", .preparse = fscrypt_provisioning_key_preparse, .free_preparse = fscrypt_provisioning_key_free_preparse, .instantiate = generic_key_instantiate, .describe = fscrypt_provisioning_key_describe, .destroy = fscrypt_provisioning_key_destroy, }; /* * Retrieve the raw key from the Linux keyring key specified by 'key_id', and * store it into 'secret'. * * The key must be of type "fscrypt-provisioning" and must have the field * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's * only usable with fscrypt with the particular KDF version identified by * 'type'. We don't use the "logon" key type because there's no way to * completely restrict the use of such keys; they can be used by any kernel API * that accepts "logon" keys and doesn't require a specific service prefix. * * The ability to specify the key via Linux keyring key is intended for cases * where userspace needs to re-add keys after the filesystem is unmounted and * re-mounted. 
Most users should just provide the raw key directly instead. */ static int get_keyring_key(u32 key_id, u32 type, struct fscrypt_master_key_secret *secret) { key_ref_t ref; struct key *key; const struct fscrypt_provisioning_key_payload *payload; int err; ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH); if (IS_ERR(ref)) return PTR_ERR(ref); key = key_ref_to_ptr(ref); if (key->type != &key_type_fscrypt_provisioning) goto bad_key; payload = key->payload.data[0]; /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */ if (payload->type != type) goto bad_key; secret->size = key->datalen - sizeof(*payload); memcpy(secret->raw, payload->raw, secret->size); err = 0; goto out_put; bad_key: err = -EKEYREJECTED; out_put: key_ref_put(ref); return err; } /* * Add a master encryption key to the filesystem, causing all files which were * encrypted with it to appear "unlocked" (decrypted) when accessed. * * When adding a key for use by v1 encryption policies, this ioctl is * privileged, and userspace must provide the 'key_descriptor'. * * When adding a key for use by v2+ encryption policies, this ioctl is * unprivileged. This is needed, in general, to allow non-root users to use * encryption without encountering the visibility problems of process-subscribed * keyrings and the inability to properly remove keys. This works by having * each key identified by its cryptographically secure hash --- the * 'key_identifier'. The cryptographic hash ensures that a malicious user * cannot add the wrong key for a given identifier. Furthermore, each added key * is charged to the appropriate user's quota for the keyrings service, which * prevents a malicious user from adding too many keys. Finally, we forbid a * user from removing a key while other users have added it too, which prevents * a user who knows another user's key from causing a denial-of-service by * removing it at an inopportune time. (We tolerate that a user who knows a key * can prevent other users from removing it.) * * For more details, see the "FS_IOC_ADD_ENCRYPTION_KEY" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_add_key_arg __user *uarg = _uarg; struct fscrypt_add_key_arg arg; struct fscrypt_master_key_secret secret; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add keys that are identified by an arbitrary descriptor * rather than by a cryptographic hash --- since otherwise a malicious * user could add the wrong key. 
 */
	if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
	    !capable(CAP_SYS_ADMIN))
		return -EACCES;

	memset(&secret, 0, sizeof(secret));
	if (arg.key_id) {
		if (arg.raw_size != 0)
			return -EINVAL;
		err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret);
		if (err)
			goto out_wipe_secret;
	} else {
		if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
		    arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
			return -EINVAL;
		secret.size = arg.raw_size;
		err = -EFAULT;
		if (copy_from_user(secret.raw, uarg->raw, secret.size))
			goto out_wipe_secret;
	}

	err = add_master_key(sb, &secret, &arg.key_spec);
	if (err)
		goto out_wipe_secret;

	/* Return the key identifier to userspace, if applicable */
	err = -EFAULT;
	if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
	    copy_to_user(uarg->key_spec.u.identifier,
			 arg.key_spec.u.identifier,
			 FSCRYPT_KEY_IDENTIFIER_SIZE))
		goto out_wipe_secret;
	err = 0;
out_wipe_secret:
	wipe_master_key_secret(&secret);
	return err;
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);

static void
fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
{
	static u8 test_key[FSCRYPT_MAX_KEY_SIZE];

	get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);

	memset(secret, 0, sizeof(*secret));
	secret->size = FSCRYPT_MAX_KEY_SIZE;
	memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE);
}

int fscrypt_get_test_dummy_key_identifier(
				u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	struct fscrypt_master_key_secret secret;
	int err;

	fscrypt_get_test_dummy_secret(&secret);

	err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
	if (err)
		goto out;
	err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER,
				  NULL, 0, key_identifier,
				  FSCRYPT_KEY_IDENTIFIER_SIZE);
out:
	wipe_master_key_secret(&secret);
	return err;
}

/**
 * fscrypt_add_test_dummy_key() - add the test dummy encryption key
 * @sb: the filesystem instance to add the key to
 * @key_spec: the key specifier of the test dummy encryption key
 *
 * Add the key for the test_dummy_encryption mount option to the filesystem.
 * To prevent misuse of this mount option, a per-boot random key is used
 * instead of a hardcoded one.  This makes it so that any encrypted files
 * created using this option won't be accessible after a reboot.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_add_test_dummy_key(struct super_block *sb,
			       struct fscrypt_key_specifier *key_spec)
{
	struct fscrypt_master_key_secret secret;
	int err;

	fscrypt_get_test_dummy_secret(&secret);
	err = add_master_key(sb, &secret, key_spec);
	wipe_master_key_secret(&secret);
	return err;
}

/*
 * Verify that the current user has added a master key with the given
 * identifier (returns -ENOKEY if not).  This is needed to prevent a user from
 * encrypting their files using some other user's key which they don't
 * actually know.  Cryptographically this isn't much of a problem, but the
 * semantics of this would be a bit weird, so it's best to just forbid it.
 *
 * The system administrator (CAP_FOWNER) can override this, which should be
 * enough for any use cases where encryption policies are being set using keys
 * that were chosen ahead of time but aren't available at the moment.
 *
 * Note that the key may have already been removed by the time this returns,
 * but that's okay; we just care whether the key was there at some point.
* * Return: 0 if the key is added, -ENOKEY if it isn't, or another -errno code */ int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]) { struct fscrypt_key_specifier mk_spec; struct fscrypt_master_key *mk; struct key *mk_user; int err; mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); mk = fscrypt_find_master_key(sb, &mk_spec); if (!mk) { err = -ENOKEY; goto out; } down_read(&mk->mk_sem); mk_user = find_master_key_user(mk); if (IS_ERR(mk_user)) { err = PTR_ERR(mk_user); } else { key_put(mk_user); err = 0; } up_read(&mk->mk_sem); fscrypt_put_master_key(mk); out: if (err == -ENOKEY && capable(CAP_FOWNER)) err = 0; return err; } /* * Try to evict the inode's dentries from the dentry cache. If the inode is a * directory, then it can have at most one dentry; however, that dentry may be * pinned by child dentries, so first try to evict the children too. */ static void shrink_dcache_inode(struct inode *inode) { struct dentry *dentry; if (S_ISDIR(inode->i_mode)) { dentry = d_find_any_alias(inode); if (dentry) { shrink_dcache_parent(dentry); dput(dentry); } } d_prune_aliases(inode); } static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) { struct fscrypt_inode_info *ci; struct inode *inode; struct inode *toput_inode = NULL; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) { inode = ci->ci_inode; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&mk->mk_decrypted_inodes_lock); shrink_dcache_inode(inode); iput(toput_inode); toput_inode = inode; spin_lock(&mk->mk_decrypted_inodes_lock); } spin_unlock(&mk->mk_decrypted_inodes_lock); iput(toput_inode); } static int check_for_busy_inodes(struct super_block *sb, struct fscrypt_master_key *mk) { struct list_head *pos; size_t busy_count = 0; unsigned long ino; char ino_str[50] = ""; spin_lock(&mk->mk_decrypted_inodes_lock); list_for_each(pos, &mk->mk_decrypted_inodes) busy_count++; if (busy_count == 0) { spin_unlock(&mk->mk_decrypted_inodes_lock); return 0; } { /* select an example file to show for debugging purposes */ struct inode *inode = list_first_entry(&mk->mk_decrypted_inodes, struct fscrypt_inode_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; } spin_unlock(&mk->mk_decrypted_inodes_lock); /* If the inode is currently being created, ino may still be 0. */ if (ino) snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); fscrypt_warn(NULL, "%s: %zu inode(s) still busy after removing key with %s %*phN%s", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, ino_str); return -EBUSY; } static int try_to_lock_encrypted_files(struct super_block *sb, struct fscrypt_master_key *mk) { int err1; int err2; /* * An inode can't be evicted while it is dirty or has dirty pages. * Thus, we first have to clean the inodes in ->mk_decrypted_inodes. * * Just do it the easy way: call sync_filesystem(). It's overkill, but * it works, and it's more important to minimize the amount of caches we * drop than the amount of data we sync. Also, unprivileged users can * already call sync_filesystem() via sys_syncfs() or sys_sync(). 
*/ down_read(&sb->s_umount); err1 = sync_filesystem(sb); up_read(&sb->s_umount); /* If a sync error occurs, still try to evict as much as possible. */ /* * Inodes are pinned by their dentries, so we have to evict their * dentries. shrink_dcache_sb() would suffice, but would be overkill * and inappropriate for use by unprivileged users. So instead go * through the inodes' alias lists and try to evict each dentry. */ evict_dentries_for_decrypted_inodes(mk); /* * evict_dentries_for_decrypted_inodes() already iput() each inode in * the list; any inodes for which that dropped the last reference will * have been evicted due to fscrypt_drop_inode() detecting the key * removal and telling the VFS to evict the inode. So to finish, we * just need to check whether any inodes couldn't be evicted. */ err2 = check_for_busy_inodes(sb, mk); return err1 ?: err2; } /* * Try to remove an fscrypt master encryption key. * * FS_IOC_REMOVE_ENCRYPTION_KEY (all_users=false) removes the current user's * claim to the key, then removes the key itself if no other users have claims. * FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the * key itself. * * To "remove the key itself", first we transition the key to the "incompletely * removed" state, so that no more inodes can be unlocked with it. Then we try * to evict all cached inodes that had been unlocked with the key. * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" * state where it tracks the list of remaining inodes. Userspace can execute * the ioctl again later to retry eviction, or alternatively can re-add the key. * * For more details, see the "Removing keys" section of * Documentation/filesystems/fscrypt.rst. */ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_remove_key_arg __user *uarg = _uarg; struct fscrypt_remove_key_arg arg; struct fscrypt_master_key *mk; u32 status_flags = 0; int err; bool inodes_remain; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; /* * Only root can add and remove keys that are identified by an arbitrary * descriptor rather than by a cryptographic hash. */ if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && !capable(CAP_SYS_ADMIN)) return -EACCES; /* Find the key being removed. */ mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) return -ENOKEY; down_write(&mk->mk_sem); /* If relevant, remove current user's (or all users) claim to the key */ if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) { if (all_users) err = keyring_clear(mk->mk_users); else err = remove_master_key_user(mk); if (err) { up_write(&mk->mk_sem); goto out_put_key; } if (mk->mk_users->keys.nr_leaves_on_tree != 0) { /* * Other users have still added the key too. We removed * the current user's claim to the key, but we still * can't remove the key itself. */ status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS; err = 0; up_write(&mk->mk_sem); goto out_put_key; } } /* No user claims remaining. Initiate removal of the key. */ err = -ENOKEY; if (mk->mk_present) { fscrypt_initiate_key_removal(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; up_write(&mk->mk_sem); if (inodes_remain) { /* Some inodes still reference this key; try to evict them. 
*/ err = try_to_lock_encrypted_files(sb, mk); if (err == -EBUSY) { status_flags |= FSCRYPT_KEY_REMOVAL_STATUS_FLAG_FILES_BUSY; err = 0; } } /* * We return 0 if we successfully did something: removed a claim to the * key, initiated removal of the key, or tried locking the files again. * Users need to check the informational status flags if they care * whether the key has been fully removed including all files locked. */ out_put_key: fscrypt_put_master_key(mk); if (err == 0) err = put_user(status_flags, &uarg->removal_status_flags); return err; } int fscrypt_ioctl_remove_key(struct file *filp, void __user *uarg) { return do_remove_key(filp, uarg, false); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *uarg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; return do_remove_key(filp, uarg, true); } EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); /* * Retrieve the status of an fscrypt master encryption key. * * We set ->status to indicate whether the key is absent, present, or * incompletely removed. (For an explanation of what these statuses mean and * how they are represented internally, see struct fscrypt_master_key.) This * field allows applications to easily determine the status of an encrypted * directory without using a hack such as trying to open a regular file in it * (which can confuse the "incompletely removed" status with absent or present). * * In addition, for v2 policy keys we allow applications to determine, via * ->status_flags and ->user_count, whether the key has been added by the * current user, by other users, or by both. Most applications should not need * this, since ordinarily only one user should know a given key. However, if a * secret key is shared by multiple users, applications may wish to add an * already-present key to prevent other users from removing it. This ioctl can * be used to check whether that really is the case before the work is done to * add the key --- which might e.g. require prompting the user for a passphrase. * * For more details, see the "FS_IOC_GET_ENCRYPTION_KEY_STATUS" section of * Documentation/filesystems/fscrypt.rst. */ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) { struct super_block *sb = file_inode(filp)->i_sb; struct fscrypt_get_key_status_arg arg; struct fscrypt_master_key *mk; int err; if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; if (!valid_key_spec(&arg.key_spec)) return -EINVAL; if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; arg.status_flags = 0; arg.user_count = 0; memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved)); mk = fscrypt_find_master_key(sb, &arg.key_spec); if (!mk) { arg.status = FSCRYPT_KEY_STATUS_ABSENT; err = 0; goto out; } down_read(&mk->mk_sem); if (!mk->mk_present) { arg.status = refcount_read(&mk->mk_active_refs) > 0 ? 
			FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
			FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
		err = 0;
		goto out_release_key;
	}
	arg.status = FSCRYPT_KEY_STATUS_PRESENT;
	if (mk->mk_users) {
		struct key *mk_user;

		arg.user_count = mk->mk_users->keys.nr_leaves_on_tree;
		mk_user = find_master_key_user(mk);
		if (!IS_ERR(mk_user)) {
			arg.status_flags |=
				FSCRYPT_KEY_STATUS_FLAG_ADDED_BY_SELF;
			key_put(mk_user);
		} else if (mk_user != ERR_PTR(-ENOKEY)) {
			err = PTR_ERR(mk_user);
			goto out_release_key;
		}
	}
	err = 0;
out_release_key:
	up_read(&mk->mk_sem);
	fscrypt_put_master_key(mk);
out:
	if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
		err = -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(fscrypt_ioctl_get_key_status);

int __init fscrypt_init_keyring(void)
{
	int err;

	err = register_key_type(&key_type_fscrypt_user);
	if (err)
		return err;

	err = register_key_type(&key_type_fscrypt_provisioning);
	if (err)
		goto err_unregister_fscrypt_user;

	return 0;

err_unregister_fscrypt_user:
	unregister_key_type(&key_type_fscrypt_user);
	return err;
}
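/*
 * Editor's illustration (not part of the kernel file above): a minimal
 * userspace sketch of how the FS_IOC_ADD_ENCRYPTION_KEY ioctl handled by
 * fscrypt_ioctl_add_key() is typically driven.  The "/mnt" mount point and
 * the all-zero 64-byte raw key are placeholder assumptions for illustration
 * only; the structs, constants, and ioctl number come from the
 * <linux/fscrypt.h> UAPI header.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fscrypt.h>

int main(void)
{
	struct fscrypt_add_key_arg *arg;
	int i, fd;

	fd = open("/mnt", O_RDONLY);	/* any file on the filesystem */
	if (fd < 0)
		return 1;

	/* The variable-length raw key directly follows the fixed header. */
	arg = calloc(1, sizeof(*arg) + FSCRYPT_MAX_KEY_SIZE);
	if (!arg)
		return 1;
	arg->key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
	arg->raw_size = FSCRYPT_MAX_KEY_SIZE;
	/* arg->raw[] is left zeroed here; a real caller copies its secret in. */

	if (ioctl(fd, FS_IOC_ADD_ENCRYPTION_KEY, arg) != 0) {
		perror("FS_IOC_ADD_ENCRYPTION_KEY");
		return 1;
	}

	/* For v2 keys, the kernel returns the HKDF-derived key identifier. */
	printf("key identifier: ");
	for (i = 0; i < FSCRYPT_KEY_IDENTIFIER_SIZE; i++)
		printf("%02x", arg->key_spec.u.identifier[i]);
	printf("\n");
	close(fd);
	return 0;
}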
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018, Intel Corporation. */

/* A common module to handle registrations and notifications for paravirtual
 * drivers to enable accelerated datapath and support VF live migration.
 *
 * The notifier and event handling code is based on netvsc driver.
 */

#include <linux/module.h>
#include <linux/etherdevice.h>
#include <uapi/linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/if_vlan.h>
#include <net/failover.h>

static LIST_HEAD(failover_list);
static DEFINE_SPINLOCK(failover_lock);

static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
{
	struct net_device *failover_dev;
	struct failover *failover;

	spin_lock(&failover_lock);
	list_for_each_entry(failover, &failover_list, list) {
		failover_dev = rtnl_dereference(failover->failover_dev);
		if (ether_addr_equal(failover_dev->perm_addr, mac)) {
			*ops = rtnl_dereference(failover->ops);
			spin_unlock(&failover_lock);
			return failover_dev;
		}
	}
	spin_unlock(&failover_lock);
	return NULL;
}

/**
 * failover_slave_register - Register a slave netdev
 *
 * @slave_dev: slave netdev that is being registered
 *
 * Registers a slave device to a failover instance.  Only ethernet devices
 * are supported.
*/ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case 
NETDEV_CHANGE:
		return failover_slave_link_change(event_dev);
	case NETDEV_CHANGENAME:
		return failover_slave_name_change(event_dev);
	default:
		return NOTIFY_DONE;
	}
}

static struct notifier_block failover_notifier = {
	.notifier_call = failover_event,
};

static void
failover_existing_slave_register(struct net_device *failover_dev)
{
	struct net *net = dev_net(failover_dev);
	struct net_device *dev;

	rtnl_lock();
	for_each_netdev(net, dev) {
		if (netif_is_failover(dev))
			continue;
		if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
			failover_slave_register(dev);
	}
	rtnl_unlock();
}

/**
 * failover_register - Register a failover instance
 *
 * @dev: failover netdev
 * @ops: failover ops
 *
 * Allocate and register a failover instance for a failover netdev. ops
 * provides handlers for slave device register/unregister/link change/
 * name change events.
 *
 * Return: pointer to failover instance
 */
struct failover *failover_register(struct net_device *dev,
				   struct failover_ops *ops)
{
	struct failover *failover;

	if (dev->type != ARPHRD_ETHER)
		return ERR_PTR(-EINVAL);

	failover = kzalloc(sizeof(*failover), GFP_KERNEL);
	if (!failover)
		return ERR_PTR(-ENOMEM);

	rcu_assign_pointer(failover->ops, ops);
	netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL);
	dev->priv_flags |= IFF_FAILOVER;
	rcu_assign_pointer(failover->failover_dev, dev);

	spin_lock(&failover_lock);
	list_add_tail(&failover->list, &failover_list);
	spin_unlock(&failover_lock);

	netdev_info(dev, "failover master:%s registered\n", dev->name);

	failover_existing_slave_register(dev);

	return failover;
}
EXPORT_SYMBOL_GPL(failover_register);

/**
 * failover_unregister - Unregister a failover instance
 *
 * @failover: pointer to failover instance
 *
 * Unregisters and frees a failover instance.
 */
void failover_unregister(struct failover *failover)
{
	struct net_device *failover_dev;

	failover_dev = rcu_dereference(failover->failover_dev);

	netdev_info(failover_dev, "failover master:%s unregistered\n",
		    failover_dev->name);

	failover_dev->priv_flags &= ~IFF_FAILOVER;
	netdev_put(failover_dev, &failover->dev_tracker);

	spin_lock(&failover_lock);
	list_del(&failover->list);
	spin_unlock(&failover_lock);

	kfree(failover);
}
EXPORT_SYMBOL_GPL(failover_unregister);

static __init int failover_init(void)
{
	register_netdevice_notifier(&failover_notifier);

	return 0;
}
module_init(failover_init);

static __exit void failover_exit(void)
{
	unregister_netdevice_notifier(&failover_notifier);
}
module_exit(failover_exit);

MODULE_DESCRIPTION("Generic failover infrastructure/interface");
MODULE_LICENSE("GPL v2");
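/*
 * Editor's illustration (not part of the module above): roughly how a
 * paravirtual driver might plug into this infrastructure, a sketch only.
 * The pv_* names and the trivial rx handler are hypothetical;
 * failover_register(), failover_unregister(), and struct failover_ops are
 * the real API exported by this module (declared in <net/failover.h>).
 */
#include <net/failover.h>

static rx_handler_result_t pv_handle_frame(struct sk_buff **pskb)
{
	/* A real driver would steer frames received on the VF slave onto
	 * its own paravirtual netdev here; passing them up unchanged is the
	 * simplest possible behavior.
	 */
	return RX_HANDLER_PASS;
}

static struct failover_ops pv_failover_ops = {
	.slave_handle_frame = pv_handle_frame,
	/* The remaining callbacks are optional; the failover_slave_*()
	 * helpers above check each one for NULL before calling it.
	 */
};

static struct failover *pv_failover;

static int pv_enable_failover(struct net_device *pv_netdev)
{
	/* After this call, any netdev whose permanent MAC address matches
	 * pv_netdev's is auto-enslaved by the netdevice notifier above.
	 */
	pv_failover = failover_register(pv_netdev, &pv_failover_ops);
	return PTR_ERR_OR_ZERO(pv_failover);
}

static void pv_disable_failover(void)
{
	failover_unregister(pv_failover);
}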
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <net/flow_offload.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_offload.h>
#include <net/pkt_cls.h>

static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
{
	struct nft_flow_rule *flow;

	flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL);
	if (!flow)
		return NULL;

	flow->rule = flow_rule_alloc(num_actions);
	if (!flow->rule) {
		kfree(flow);
		return NULL;
	}

	flow->rule->match.dissector = &flow->match.dissector;
	flow->rule->match.mask = &flow->match.mask;
	flow->rule->match.key = &flow->match.key;

	return flow;
}

void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
				 enum flow_dissector_key_id addr_type)
{
	struct nft_flow_match *match = &flow->match;
	struct nft_flow_key *mask = &match->mask;
	struct
nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); 
break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.pf != NFPROTO_NETDEV || hook->ops.hooknum != NF_NETDEV_INGRESS) return false; dev = hook->ops.dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); 
nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct net_device *dev; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { dev = hook->ops.dev; if (this_dev && this_dev != dev) continue; err = nft_chain_offload_cmd(basechain, dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { if (i-- <= 0) break; dev = hook->ops.dev; nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); } return err; } static int 
nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(trans->ctx.chain, NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(trans->ctx.chain, NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(trans->ctx.chain, &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->ctx.flags & NLM_F_REPLACE || !(trans->ctx.flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(trans->ctx.chain, nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev != dev) continue; found = hook; break; } if (!found) continue; return chain; } } 
return NULL;
}

static int nft_offload_netdev_event(struct notifier_block *this,
				    unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct nftables_pernet *nft_net;
	struct net *net = dev_net(dev);
	struct nft_chain *chain;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	nft_net = nft_pernet(net);
	mutex_lock(&nft_net->commit_mutex);
	chain = __nft_offload_get_chain(nft_net, dev);
	if (chain)
		nft_flow_block_chain(nft_base_chain(chain), dev,
				     FLOW_BLOCK_UNBIND);

	mutex_unlock(&nft_net->commit_mutex);

	return NOTIFY_DONE;
}

static struct notifier_block nft_offload_netdev_notifier = {
	.notifier_call	= nft_offload_netdev_event,
};

int nft_offload_init(void)
{
	return register_netdevice_notifier(&nft_offload_netdev_notifier);
}

void nft_offload_exit(void)
{
	unregister_netdevice_notifier(&nft_offload_netdev_notifier);
}
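/*
 * Editor's illustration (not part of the file above): the rough shape of an
 * expression ->offload callback, which the walk in nft_flow_rule_create()
 * invokes for every expression in a rule.  This hypothetical example matches
 * on the L4 protocol number; "demo_expr" and its field are invented for
 * illustration, while the nft_flow_key/nft_flow_match fields and
 * nft_offload_set_dependency() are the real ones used throughout this file.
 */
#include <net/netfilter/nf_tables_offload.h>

struct demo_expr {
	u8 protonum;		/* hypothetical: L4 protocol to match on */
};

static int demo_expr_offload(struct nft_offload_ctx *ctx,
			     struct nft_flow_rule *flow,
			     const struct nft_expr *expr)
{
	const struct demo_expr *priv = nft_expr_priv(expr);
	struct nft_flow_match *match = &flow->match;

	/* Fill in the key and mask, and record which dissector key is used
	 * and where it lives inside struct nft_flow_key, exactly as
	 * nft_flow_rule_set_addr_type() does above for the control key.
	 */
	match->key.basic.ip_proto = priv->protonum;
	match->mask.basic.ip_proto = 0xff;
	match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
	match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] =
		offsetof(struct nft_flow_key, basic);

	/* Record the transport-layer dependency so that a later
	 * nft_offload_update_dependency() call can consume it.
	 */
	nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT);

	return 0;
}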
// SPDX-License-Identifier: GPL-2.0+ /* * Driver for USB Mass Storage compliant devices * * Current development and maintenance by: * (c) 1999-2003 Matthew Dharm (mdharm-usb@one-eyed-alien.net) * * Developed with the assistance of: * (c) 2000 David L. Brown, Jr. (usb-storage@davidb.org) * (c) 2003-2009 Alan Stern (stern@rowland.harvard.edu) * * Initial work by: * (c) 1999 Michael Gee (michael@linuxspecific.com) * * usb_device_id support by Adam J. Richter (adam@yggdrasil.com): * (c) 2000 Yggdrasil Computing, Inc. * * This driver is based on the 'USB Mass Storage Class' document. This * describes in detail the protocol used to communicate with such * devices. Clearly, the designers had SCSI and ATAPI commands in * mind when they created this document. The commands are all very * similar to commands in the SCSI-II and ATAPI specifications. * * It is important to note that in a number of cases this class * exhibits class-specific exemptions from the USB specification. * Notably the usage of NAK, STALL and ACK differs from the norm, in * that they are used to communicate wait, failed and OK on commands. * * Also, for certain devices, the interrupt endpoint is used to convey * status of a command. */ #ifdef CONFIG_USB_STORAGE_DEBUG #define DEBUG #endif #include <linux/sched.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/utsname.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include "usb.h" #include "scsiglue.h" #include "transport.h" #include "protocol.h" #include "debug.h" #include "initializers.h" #include "sierra_ms.h" #include "option_ms.h" #if IS_ENABLED(CONFIG_USB_UAS) #include "uas-detect.h" #endif #define DRV_NAME "usb-storage" /* Some informational data */ MODULE_AUTHOR("Matthew Dharm <mdharm-usb@one-eyed-alien.net>"); MODULE_DESCRIPTION("USB Mass Storage driver for Linux"); MODULE_LICENSE("GPL"); static unsigned int delay_use = 1; module_param(delay_use, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(delay_use, "seconds to delay before using a new device"); static char quirks[128]; module_param_string(quirks, quirks, sizeof(quirks), S_IRUGO | S_IWUSR); MODULE_PARM_DESC(quirks, "supplemental list of device IDs and their quirks"); /* * The entries in this table correspond, line for line, * with the entries in usb_storage_usb_ids[], defined in usual-tables.c.
*/ /* * The vendor name should be kept at eight characters or less, and * the product name should be kept at 16 characters or less. If a device * has the US_FL_FIX_INQUIRY flag, then the vendor and product names * normally generated by a device through the INQUIRY response will be * taken from this list, and this is the reason for the above size * restriction. However, if the flag is not present, then you * are free to use as many characters as you like. */ #define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \ vendor_name, product_name, use_protocol, use_transport, \ init_function, Flags) \ { \ .vendorName = vendor_name, \ .productName = product_name, \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ .initFunction = init_function, \ } #define COMPLIANT_DEV UNUSUAL_DEV #define USUAL_DEV(use_protocol, use_transport) \ { \ .useProtocol = use_protocol, \ .useTransport = use_transport, \ } static const struct us_unusual_dev us_unusual_dev_list[] = { # include "unusual_devs.h" { } /* Terminating entry */ }; static const struct us_unusual_dev for_dynamic_ids = USUAL_DEV(USB_SC_SCSI, USB_PR_BULK); #undef UNUSUAL_DEV #undef COMPLIANT_DEV #undef USUAL_DEV #ifdef CONFIG_LOCKDEP static struct lock_class_key us_interface_key[USB_MAXINTERFACES]; static void us_set_lock_class(struct mutex *mutex, struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_config *config = udev->actconfig; int i; for (i = 0; i < config->desc.bNumInterfaces; i++) { if (config->interface[i] == intf) break; } BUG_ON(i == config->desc.bNumInterfaces); lockdep_set_class(mutex, &us_interface_key[i]); } #else static void us_set_lock_class(struct mutex *mutex, struct usb_interface *intf) { } #endif #ifdef CONFIG_PM /* Minimal support for suspend and resume */ int usb_stor_suspend(struct usb_interface *iface, pm_message_t message) { struct us_data *us = usb_get_intfdata(iface); /* Wait until no command is running */ mutex_lock(&us->dev_mutex); if (us->suspend_resume_hook) (us->suspend_resume_hook)(us, US_SUSPEND); /* * When runtime PM is working, we'll set a flag to indicate * whether we should autoresume when a SCSI request arrives. */ mutex_unlock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_suspend); int usb_stor_resume(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); mutex_lock(&us->dev_mutex); if (us->suspend_resume_hook) (us->suspend_resume_hook)(us, US_RESUME); mutex_unlock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_resume); int usb_stor_reset_resume(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); /* Report the reset to the SCSI core */ usb_stor_report_bus_reset(us); /* * If any of the subdrivers implemented a reinitialization scheme, * this is where the callback would be invoked. */ return 0; } EXPORT_SYMBOL_GPL(usb_stor_reset_resume); #endif /* CONFIG_PM */ /* * The next two routines get called just before and just after * a USB port reset, whether from this driver or a different one.
*/ int usb_stor_pre_reset(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); /* Make sure no command runs during the reset */ mutex_lock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_pre_reset); int usb_stor_post_reset(struct usb_interface *iface) { struct us_data *us = usb_get_intfdata(iface); /* Report the reset to the SCSI core */ usb_stor_report_bus_reset(us); /* * If any of the subdrivers implemented a reinitialization scheme, * this is where the callback would be invoked. */ mutex_unlock(&us->dev_mutex); return 0; } EXPORT_SYMBOL_GPL(usb_stor_post_reset); /* * fill_inquiry_response takes an unsigned char array (which must * be at least 36 characters) and populates the vendor name, * product name, and revision fields. Then the array is copied * into the SCSI command's response buffer (oddly enough * called request_buffer). data_len contains the length of the * data array, which again must be at least 36. */ void fill_inquiry_response(struct us_data *us, unsigned char *data, unsigned int data_len) { if (data_len < 36) /* You lose. */ return; memset(data+8, ' ', 28); if (data[0]&0x20) { /* * USB device currently not connected. Return * peripheral qualifier 001b ("...however, the * physical device is not currently connected * to this logical unit") and leave vendor and * product identification empty. ("If the target * does store some of the INQUIRY data on the * device, it may return zeros or ASCII spaces * (20h) in those fields until the data is * available from the device."). */ } else { u16 bcdDevice = le16_to_cpu(us->pusb_dev->descriptor.bcdDevice); int n; n = strlen(us->unusual_dev->vendorName); memcpy(data+8, us->unusual_dev->vendorName, min(8, n)); n = strlen(us->unusual_dev->productName); memcpy(data+16, us->unusual_dev->productName, min(16, n)); data[32] = 0x30 + ((bcdDevice>>12) & 0x0F); data[33] = 0x30 + ((bcdDevice>>8) & 0x0F); data[34] = 0x30 + ((bcdDevice>>4) & 0x0F); data[35] = 0x30 + ((bcdDevice) & 0x0F); } usb_stor_set_xfer_buf(data, data_len, us->srb); } EXPORT_SYMBOL_GPL(fill_inquiry_response); static int usb_stor_control_thread(void * __us) { struct us_data *us = (struct us_data *)__us; struct Scsi_Host *host = us_to_host(us); struct scsi_cmnd *srb; for (;;) { usb_stor_dbg(us, "*** thread sleeping\n"); if (wait_for_completion_interruptible(&us->cmnd_ready)) break; usb_stor_dbg(us, "*** thread awakened\n"); /* lock the device pointers */ mutex_lock(&(us->dev_mutex)); /* lock access to the state */ scsi_lock(host); /* When we are called with no command pending, we're done */ srb = us->srb; if (srb == NULL) { scsi_unlock(host); mutex_unlock(&us->dev_mutex); usb_stor_dbg(us, "-- exiting\n"); break; } /* has the command timed out *already* ? 
*/ if (test_bit(US_FLIDX_TIMED_OUT, &us->dflags)) { srb->result = DID_ABORT << 16; goto SkipForAbort; } scsi_unlock(host); /* * reject the command if the direction indicator * is UNKNOWN */ if (srb->sc_data_direction == DMA_BIDIRECTIONAL) { usb_stor_dbg(us, "UNKNOWN data direction\n"); srb->result = DID_ERROR << 16; } /* * reject if target != 0 or if LUN is higher than * the maximum known LUN */ else if (srb->device->id && !(us->fflags & US_FL_SCM_MULT_TARG)) { usb_stor_dbg(us, "Bad target number (%d:%llu)\n", srb->device->id, srb->device->lun); srb->result = DID_BAD_TARGET << 16; } else if (srb->device->lun > us->max_lun) { usb_stor_dbg(us, "Bad LUN (%d:%llu)\n", srb->device->id, srb->device->lun); srb->result = DID_BAD_TARGET << 16; } /* * Handle those devices which need us to fake * their inquiry data */ else if ((srb->cmnd[0] == INQUIRY) && (us->fflags & US_FL_FIX_INQUIRY)) { unsigned char data_ptr[36] = { 0x00, 0x80, 0x02, 0x02, 0x1F, 0x00, 0x00, 0x00}; usb_stor_dbg(us, "Faking INQUIRY command\n"); fill_inquiry_response(us, data_ptr, 36); srb->result = SAM_STAT_GOOD; } /* we've got a command, let's do it! */ else { US_DEBUG(usb_stor_show_command(us, srb)); us->proto_handler(srb, us); usb_mark_last_busy(us->pusb_dev); } /* lock access to the state */ scsi_lock(host); /* was the command aborted? */ if (srb->result == DID_ABORT << 16) { SkipForAbort: usb_stor_dbg(us, "scsi command aborted\n"); srb = NULL; /* Don't call scsi_done() */ } /* * If an abort request was received we need to signal that * the abort has finished. The proper test for this is * the TIMED_OUT flag, not srb->result == DID_ABORT, because * the timeout might have occurred after the command had * already completed with a different result code. */ if (test_bit(US_FLIDX_TIMED_OUT, &us->dflags)) { complete(&(us->notify)); /* Allow USB transfers to resume */ clear_bit(US_FLIDX_ABORTING, &us->dflags); clear_bit(US_FLIDX_TIMED_OUT, &us->dflags); } /* finished working on this command */ us->srb = NULL; scsi_unlock(host); /* unlock the device pointers */ mutex_unlock(&us->dev_mutex); /* now that the locks are released, notify the SCSI core */ if (srb) { usb_stor_dbg(us, "scsi cmd done, result=0x%x\n", srb->result); scsi_done_direct(srb); } } /* for (;;) */ /* Wait until we are told to stop */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; schedule(); } __set_current_state(TASK_RUNNING); return 0; } /*********************************************************************** * Device probing and disconnecting ***********************************************************************/ /* Associate our private data with the USB device */ static int associate_dev(struct us_data *us, struct usb_interface *intf) { /* Fill in the device-related fields */ us->pusb_dev = interface_to_usbdev(intf); us->pusb_intf = intf; us->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; usb_stor_dbg(us, "Vendor: 0x%04x, Product: 0x%04x, Revision: 0x%04x\n", le16_to_cpu(us->pusb_dev->descriptor.idVendor), le16_to_cpu(us->pusb_dev->descriptor.idProduct), le16_to_cpu(us->pusb_dev->descriptor.bcdDevice)); usb_stor_dbg(us, "Interface Subclass: 0x%02x, Protocol: 0x%02x\n", intf->cur_altsetting->desc.bInterfaceSubClass, intf->cur_altsetting->desc.bInterfaceProtocol); /* Store our private data in the interface */ usb_set_intfdata(intf, us); /* Allocate the control/setup and DMA-mapped buffers */ us->cr = kmalloc(sizeof(*us->cr), GFP_KERNEL); if (!us->cr) return -ENOMEM; us->iobuf = usb_alloc_coherent(us->pusb_dev, 
US_IOBUF_SIZE, GFP_KERNEL, &us->iobuf_dma); if (!us->iobuf) { usb_stor_dbg(us, "I/O buffer allocation failed\n"); return -ENOMEM; } return 0; } /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) /* Adjust device flags based on the "quirks=" module parameter */ void usb_stor_adjust_quirks(struct usb_device *udev, u64 *fflags) { char *p; u16 vid = le16_to_cpu(udev->descriptor.idVendor); u16 pid = le16_to_cpu(udev->descriptor.idProduct); u64 f = 0; u64 mask = (US_FL_SANE_SENSE | US_FL_BAD_SENSE | US_FL_FIX_CAPACITY | US_FL_IGNORE_UAS | US_FL_CAPACITY_HEURISTICS | US_FL_IGNORE_DEVICE | US_FL_NOT_LOCKABLE | US_FL_MAX_SECTORS_64 | US_FL_CAPACITY_OK | US_FL_IGNORE_RESIDUE | US_FL_SINGLE_LUN | US_FL_NO_WP_DETECT | US_FL_NO_READ_DISC_INFO | US_FL_NO_READ_CAPACITY_16 | US_FL_INITIAL_READ10 | US_FL_WRITE_CACHE | US_FL_NO_ATA_1X | US_FL_NO_REPORT_OPCODES | US_FL_MAX_SECTORS_240 | US_FL_NO_REPORT_LUNS | US_FL_ALWAYS_SYNC); p = quirks; while (*p) { /* Each entry consists of VID:PID:flags */ if (vid == simple_strtoul(p, &p, 16) && *p == ':' && pid == simple_strtoul(p+1, &p, 16) && *p == ':') break; /* Move forward to the next entry */ while (*p) { if (*p++ == ',') break; } } if (!*p) /* No match */ return; /* Collect the flags */ while (*++p && *p != ',') { switch (TOLOWER(*p)) { case 'a': f |= US_FL_SANE_SENSE; break; case 'b': f |= US_FL_BAD_SENSE; break; case 'c': f |= US_FL_FIX_CAPACITY; break; case 'd': f |= US_FL_NO_READ_DISC_INFO; break; case 'e': f |= US_FL_NO_READ_CAPACITY_16; break; case 'f': f |= US_FL_NO_REPORT_OPCODES; break; case 'g': f |= US_FL_MAX_SECTORS_240; break; case 'h': f |= US_FL_CAPACITY_HEURISTICS; break; case 'i': f |= US_FL_IGNORE_DEVICE; break; case 'j': f |= US_FL_NO_REPORT_LUNS; break; case 'k': f |= US_FL_NO_SAME; break; case 'l': f |= US_FL_NOT_LOCKABLE; break; case 'm': f |= US_FL_MAX_SECTORS_64; break; case 'n': f |= US_FL_INITIAL_READ10; break; case 'o': f |= US_FL_CAPACITY_OK; break; case 'p': f |= US_FL_WRITE_CACHE; break; case 'r': f |= US_FL_IGNORE_RESIDUE; break; case 's': f |= US_FL_SINGLE_LUN; break; case 't': f |= US_FL_NO_ATA_1X; break; case 'u': f |= US_FL_IGNORE_UAS; break; case 'w': f |= US_FL_NO_WP_DETECT; break; case 'y': f |= US_FL_ALWAYS_SYNC; break; /* Ignore unrecognized flag characters */ } } *fflags = (*fflags & ~mask) | f; } EXPORT_SYMBOL_GPL(usb_stor_adjust_quirks); /* Get the unusual_devs entries and the string descriptors */ static int get_device_info(struct us_data *us, const struct usb_device_id *id, const struct us_unusual_dev *unusual_dev) { struct usb_device *dev = us->pusb_dev; struct usb_interface_descriptor *idesc = &us->pusb_intf->cur_altsetting->desc; struct device *pdev = &us->pusb_intf->dev; /* Store the entries */ us->unusual_dev = unusual_dev; us->subclass = (unusual_dev->useProtocol == USB_SC_DEVICE) ? idesc->bInterfaceSubClass : unusual_dev->useProtocol; us->protocol = (unusual_dev->useTransport == USB_PR_DEVICE) ? 
idesc->bInterfaceProtocol : unusual_dev->useTransport; us->fflags = id->driver_info; usb_stor_adjust_quirks(us->pusb_dev, &us->fflags); if (us->fflags & US_FL_IGNORE_DEVICE) { dev_info(pdev, "device ignored\n"); return -ENODEV; } /* * This flag is only needed when we're in high-speed, so let's * disable it if we're in full-speed */ if (dev->speed != USB_SPEED_HIGH) us->fflags &= ~US_FL_GO_SLOW; if (us->fflags) dev_info(pdev, "Quirks match for vid %04x pid %04x: %llx\n", le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct), us->fflags); /* * Log a message if a non-generic unusual_dev entry contains an * unnecessary subclass or protocol override. This may stimulate * reports from users that will help us remove unneeded entries * from the unusual_devs.h table. */ if (id->idVendor || id->idProduct) { static const char *msgs[3] = { "an unneeded SubClass entry", "an unneeded Protocol entry", "unneeded SubClass and Protocol entries"}; struct usb_device_descriptor *ddesc = &dev->descriptor; int msg = -1; if (unusual_dev->useProtocol != USB_SC_DEVICE && us->subclass == idesc->bInterfaceSubClass) msg += 1; if (unusual_dev->useTransport != USB_PR_DEVICE && us->protocol == idesc->bInterfaceProtocol) msg += 2; if (msg >= 0 && !(us->fflags & US_FL_NEED_OVERRIDE)) dev_notice(pdev, "This device " "(%04x,%04x,%04x S %02x P %02x)" " has %s in unusual_devs.h (kernel" " %s)\n" " Please send a copy of this message to " "<linux-usb@vger.kernel.org> and " "<usb-storage@lists.one-eyed-alien.net>\n", le16_to_cpu(ddesc->idVendor), le16_to_cpu(ddesc->idProduct), le16_to_cpu(ddesc->bcdDevice), idesc->bInterfaceSubClass, idesc->bInterfaceProtocol, msgs[msg], utsname()->release); } return 0; } /* Get the transport settings */ static void get_transport(struct us_data *us) { switch (us->protocol) { case USB_PR_CB: us->transport_name = "Control/Bulk"; us->transport = usb_stor_CB_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 7; break; case USB_PR_CBI: us->transport_name = "Control/Bulk/Interrupt"; us->transport = usb_stor_CB_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 7; break; case USB_PR_BULK: us->transport_name = "Bulk"; us->transport = usb_stor_Bulk_transport; us->transport_reset = usb_stor_Bulk_reset; break; } } /* Get the protocol settings */ static void get_protocol(struct us_data *us) { switch (us->subclass) { case USB_SC_RBC: us->protocol_name = "Reduced Block Commands (RBC)"; us->proto_handler = usb_stor_transparent_scsi_command; break; case USB_SC_8020: us->protocol_name = "8020i"; us->proto_handler = usb_stor_pad12_command; us->max_lun = 0; break; case USB_SC_QIC: us->protocol_name = "QIC-157"; us->proto_handler = usb_stor_pad12_command; us->max_lun = 0; break; case USB_SC_8070: us->protocol_name = "8070i"; us->proto_handler = usb_stor_pad12_command; us->max_lun = 0; break; case USB_SC_SCSI: us->protocol_name = "Transparent SCSI"; us->proto_handler = usb_stor_transparent_scsi_command; break; case USB_SC_UFI: us->protocol_name = "Uniform Floppy Interface (UFI)"; us->proto_handler = usb_stor_ufi_command; break; } } /* Get the pipe settings */ static int get_pipes(struct us_data *us) { struct usb_host_interface *alt = us->pusb_intf->cur_altsetting; struct usb_endpoint_descriptor *ep_in; struct usb_endpoint_descriptor *ep_out; struct usb_endpoint_descriptor *ep_int; int res; /* * Find the first endpoint of each type we need. * We are expecting a minimum of 2 endpoints - in and out (bulk). 
* An optional interrupt-in is OK (necessary for CBI protocol). * We will ignore any others. */ res = usb_find_common_endpoints(alt, &ep_in, &ep_out, NULL, NULL); if (res) { usb_stor_dbg(us, "bulk endpoints not found\n"); return res; } res = usb_find_int_in_endpoint(alt, &ep_int); if (res && us->protocol == USB_PR_CBI) { usb_stor_dbg(us, "interrupt endpoint not found\n"); return res; } /* Calculate and store the pipe values */ us->send_ctrl_pipe = usb_sndctrlpipe(us->pusb_dev, 0); us->recv_ctrl_pipe = usb_rcvctrlpipe(us->pusb_dev, 0); us->send_bulk_pipe = usb_sndbulkpipe(us->pusb_dev, usb_endpoint_num(ep_out)); us->recv_bulk_pipe = usb_rcvbulkpipe(us->pusb_dev, usb_endpoint_num(ep_in)); if (ep_int) { us->recv_intr_pipe = usb_rcvintpipe(us->pusb_dev, usb_endpoint_num(ep_int)); us->ep_bInterval = ep_int->bInterval; } return 0; } /* Initialize all the dynamic resources we need */ static int usb_stor_acquire_resources(struct us_data *us) { int p; struct task_struct *th; us->current_urb = usb_alloc_urb(0, GFP_KERNEL); if (!us->current_urb) return -ENOMEM; /* * Just before we start our control thread, initialize * the device if it needs initialization */ if (us->unusual_dev->initFunction) { p = us->unusual_dev->initFunction(us); if (p) return p; } /* Start up our control thread */ th = kthread_run(usb_stor_control_thread, us, "usb-storage"); if (IS_ERR(th)) { dev_warn(&us->pusb_intf->dev, "Unable to start control thread\n"); return PTR_ERR(th); } us->ctl_thread = th; return 0; } /* Release all our dynamic resources */ static void usb_stor_release_resources(struct us_data *us) { /* * Tell the control thread to exit. The SCSI host must * already have been removed and the DISCONNECTING flag set * so that we won't accept any more commands. */ usb_stor_dbg(us, "-- sending exit command to thread\n"); complete(&us->cmnd_ready); if (us->ctl_thread) kthread_stop(us->ctl_thread); /* Call the destructor routine, if it exists */ if (us->extra_destructor) { usb_stor_dbg(us, "-- calling extra_destructor()\n"); us->extra_destructor(us->extra); } /* Free the extra data and the URB */ kfree(us->extra); usb_free_urb(us->current_urb); } /* Dissociate from the USB device */ static void dissociate_dev(struct us_data *us) { /* Free the buffers */ kfree(us->cr); usb_free_coherent(us->pusb_dev, US_IOBUF_SIZE, us->iobuf, us->iobuf_dma); /* Remove our private data from the interface */ usb_set_intfdata(us->pusb_intf, NULL); } /* * First stage of disconnect processing: stop SCSI scanning, * remove the host, and stop accepting new commands */ static void quiesce_and_remove_host(struct us_data *us) { struct Scsi_Host *host = us_to_host(us); /* If the device is really gone, cut short reset delays */ if (us->pusb_dev->state == USB_STATE_NOTATTACHED) { set_bit(US_FLIDX_DISCONNECTING, &us->dflags); wake_up(&us->delay_wait); } /* * Prevent SCSI scanning (if it hasn't started yet) * or wait for the SCSI-scanning routine to stop. */ cancel_delayed_work_sync(&us->scan_dwork); /* Balance autopm calls if scanning was cancelled */ if (test_bit(US_FLIDX_SCAN_PENDING, &us->dflags)) usb_autopm_put_interface_no_suspend(us->pusb_intf); /* * Removing the host will perform an orderly shutdown: caches * synchronized, disks spun down, etc. */ scsi_remove_host(host); /* * Prevent any new commands from being accepted and cut short * reset delays. 
*/ scsi_lock(host); set_bit(US_FLIDX_DISCONNECTING, &us->dflags); scsi_unlock(host); wake_up(&us->delay_wait); } /* Second stage of disconnect processing: deallocate all resources */ static void release_everything(struct us_data *us) { usb_stor_release_resources(us); dissociate_dev(us); /* * Drop our reference to the host; the SCSI core will free it * (and "us" along with it) when the refcount becomes 0. */ scsi_host_put(us_to_host(us)); } /* Delayed-work routine to carry out SCSI-device scanning */ static void usb_stor_scan_dwork(struct work_struct *work) { struct us_data *us = container_of(work, struct us_data, scan_dwork.work); struct device *dev = &us->pusb_intf->dev; dev_dbg(dev, "starting scan\n"); /* For bulk-only devices, determine the max LUN value */ if (us->protocol == USB_PR_BULK && !(us->fflags & US_FL_SINGLE_LUN) && !(us->fflags & US_FL_SCM_MULT_TARG)) { mutex_lock(&us->dev_mutex); us->max_lun = usb_stor_Bulk_max_lun(us); /* * Allow proper scanning of devices that present more than 8 LUNs, * while not affecting other devices that may need the previous * behavior. */ if (us->max_lun >= 8) us_to_host(us)->max_lun = us->max_lun+1; mutex_unlock(&us->dev_mutex); } scsi_scan_host(us_to_host(us)); dev_dbg(dev, "scan complete\n"); /* Should we unbind if no devices were detected? */ usb_autopm_put_interface(us->pusb_intf); clear_bit(US_FLIDX_SCAN_PENDING, &us->dflags); } static unsigned int usb_stor_sg_tablesize(struct usb_interface *intf) { struct usb_device *usb_dev = interface_to_usbdev(intf); if (usb_dev->bus->sg_tablesize) { return usb_dev->bus->sg_tablesize; } return SG_ALL; } /* First part of general USB mass-storage probing */ int usb_stor_probe1(struct us_data **pus, struct usb_interface *intf, const struct usb_device_id *id, const struct us_unusual_dev *unusual_dev, const struct scsi_host_template *sht) { struct Scsi_Host *host; struct us_data *us; int result; dev_info(&intf->dev, "USB Mass Storage device detected\n"); /* * Ask the SCSI layer to allocate a host structure, with extra * space at the end for our private us_data structure. */ host = scsi_host_alloc(sht, sizeof(*us)); if (!host) { dev_warn(&intf->dev, "Unable to allocate the scsi host\n"); return -ENOMEM; } /* * Allow 16-byte CDBs and thus > 2TB */ host->max_cmd_len = 16; host->sg_tablesize = usb_stor_sg_tablesize(intf); *pus = us = host_to_us(host); mutex_init(&(us->dev_mutex)); us_set_lock_class(&us->dev_mutex, intf); init_completion(&us->cmnd_ready); init_completion(&(us->notify)); init_waitqueue_head(&us->delay_wait); INIT_DELAYED_WORK(&us->scan_dwork, usb_stor_scan_dwork); /* Associate the us_data structure with the USB device */ result = associate_dev(us, intf); if (result) goto BadDevice; /* Get the unusual_devs entries and the descriptors */ result = get_device_info(us, id, unusual_dev); if (result) goto BadDevice; /* Get standard transport and protocol settings */ get_transport(us); get_protocol(us); /* * Give the caller a chance to fill in specialized transport * or protocol settings.
*/ return 0; BadDevice: usb_stor_dbg(us, "storage_probe() failed\n"); release_everything(us); return result; } EXPORT_SYMBOL_GPL(usb_stor_probe1); /* Second part of general USB mass-storage probing */ int usb_stor_probe2(struct us_data *us) { int result; struct device *dev = &us->pusb_intf->dev; /* Make sure the transport and protocol have both been set */ if (!us->transport || !us->proto_handler) { result = -ENXIO; goto BadDevice; } usb_stor_dbg(us, "Transport: %s\n", us->transport_name); usb_stor_dbg(us, "Protocol: %s\n", us->protocol_name); if (us->fflags & US_FL_SCM_MULT_TARG) { /* * SCM eUSCSI bridge devices can have different numbers * of LUNs on different targets; allow all to be probed. */ us->max_lun = 7; /* The eUSCSI itself has ID 7, so avoid scanning that */ us_to_host(us)->this_id = 7; /* max_id is 8 initially, so no need to set it here */ } else { /* In the normal case there is only a single target */ us_to_host(us)->max_id = 1; /* * Like Windows, we won't store the LUN bits in CDB[1] for * SCSI-2 devices using the Bulk-Only transport (even though * this violates the SCSI spec). */ if (us->transport == usb_stor_Bulk_transport) us_to_host(us)->no_scsi2_lun_in_cdb = 1; } /* fix for single-lun devices */ if (us->fflags & US_FL_SINGLE_LUN) us->max_lun = 0; /* Find the endpoints and calculate pipe values */ result = get_pipes(us); if (result) goto BadDevice; /* * If the device returns invalid data for the first READ(10) * command, indicate the command should be retried. */ if (us->fflags & US_FL_INITIAL_READ10) set_bit(US_FLIDX_REDO_READ10, &us->dflags); /* Acquire all the other resources and add the host */ result = usb_stor_acquire_resources(us); if (result) goto BadDevice; usb_autopm_get_interface_no_resume(us->pusb_intf); snprintf(us->scsi_name, sizeof(us->scsi_name), "usb-storage %s", dev_name(&us->pusb_intf->dev)); result = scsi_add_host(us_to_host(us), dev); if (result) { dev_warn(dev, "Unable to add the scsi host\n"); goto HostAddErr; } /* Submit the delayed_work for SCSI-device scanning */ set_bit(US_FLIDX_SCAN_PENDING, &us->dflags); if (delay_use > 0) dev_dbg(dev, "waiting for device to settle before scanning\n"); queue_delayed_work(system_freezable_wq, &us->scan_dwork, delay_use * HZ); return 0; /* We come here if there are any problems */ HostAddErr: usb_autopm_put_interface_no_suspend(us->pusb_intf); BadDevice: usb_stor_dbg(us, "storage_probe() failed\n"); release_everything(us); return result; } EXPORT_SYMBOL_GPL(usb_stor_probe2); /* Handle a USB mass-storage disconnect */ void usb_stor_disconnect(struct usb_interface *intf) { struct us_data *us = usb_get_intfdata(intf); quiesce_and_remove_host(us); release_everything(us); } EXPORT_SYMBOL_GPL(usb_stor_disconnect); static struct scsi_host_template usb_stor_host_template; /* The main probe routine for standard devices */ static int storage_probe(struct usb_interface *intf, const struct usb_device_id *id) { const struct us_unusual_dev *unusual_dev; struct us_data *us; int result; int size; /* If uas is enabled and this device can do uas then ignore it. */ #if IS_ENABLED(CONFIG_USB_UAS) if (uas_use_uas_driver(intf, id, NULL)) return -ENXIO; #endif /* * If the device isn't standard (is handled by a subdriver * module) then don't accept it. */ if (usb_usual_ignore_device(intf)) return -ENXIO; /* * Call the general probe procedures. * * The unusual_dev_list array is parallel to the usb_storage_usb_ids * table, so we use the index of the id entry to find the * corresponding unusual_devs entry. 
size = ARRAY_SIZE(us_unusual_dev_list); if (id >= usb_storage_usb_ids && id < usb_storage_usb_ids + size) { unusual_dev = (id - usb_storage_usb_ids) + us_unusual_dev_list; } else { unusual_dev = &for_dynamic_ids; dev_dbg(&intf->dev, "Use Bulk-Only transport with the Transparent SCSI protocol for dynamic id: 0x%04x 0x%04x\n", id->idVendor, id->idProduct); } result = usb_stor_probe1(&us, intf, id, unusual_dev, &usb_stor_host_template); if (result) return result; /* No special transport or protocol settings in the main module */ result = usb_stor_probe2(us); return result; } static struct usb_driver usb_storage_driver = { .name = DRV_NAME, .probe = storage_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = usb_storage_usb_ids, .supports_autosuspend = 1, .soft_unbind = 1, }; module_usb_stor_driver(usb_storage_driver, usb_stor_host_template, DRV_NAME);
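/*
 * A standalone userspace sketch (not part of the driver) of the
 * VID:PID:flags match performed by usb_stor_adjust_quirks() above.  The
 * quirks string is a comma-separated list of hex triples; for example,
 * per the flag table above, passing quirks=0421:0433:i on the kernel
 * command line would set US_FL_IGNORE_DEVICE for that device.  The
 * helper name find_quirk_flags() and the sample IDs below are invented
 * purely for illustration.
 */
#include <stdio.h>
#include <stdlib.h>

/* Return a pointer to the flag letters for vid:pid, or NULL if absent. */
static const char *find_quirk_flags(const char *p,
				    unsigned long vid, unsigned long pid)
{
	char *end;

	while (*p) {
		/* Each entry consists of VID:PID:flags, all in hex */
		if (strtoul(p, &end, 16) == vid && *end == ':' &&
		    strtoul(end + 1, &end, 16) == pid && *end == ':')
			return end + 1;
		/* Move forward to the next comma-separated entry */
		while (*p && *p != ',')
			p++;
		if (*p == ',')
			p++;
	}
	return NULL;
}

int main(void)
{
	/* Flag letters run until the next ',' just as in the kernel loop */
	const char *f = find_quirk_flags("0421:0433:i,0781:5580:u",
					 0x0781, 0x5580);

	printf("flags: %s\n", f ? f : "(no match)");	/* prints "u" */
	return 0;
}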
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H #ifndef __ASSEMBLY__ #include <asm/barrier.h> static inline void cpu_relax(void) { #ifdef __riscv_muldiv int dummy; /* In lieu of a halt instruction, induce a long-latency stall. */ __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); #endif #ifdef CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE /* * Reduce instruction retirement. * This assumes the PC changes. */ __asm__ __volatile__ ("pause"); #else /* Encoding of the pause instruction */ __asm__ __volatile__ (".4byte 0x100000F"); #endif barrier(); } #endif /* __ASSEMBLY__ */ #endif /* __ASM_VDSO_PROCESSOR_H */
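/*
 * A brief sketch of how cpu_relax() above is typically consumed: a
 * busy-wait loop that polls a shared variable, hinting the core on each
 * iteration (the div stall or pause hint shown above).  example_flag and
 * example_wait_for_flag() are invented for this illustration; READ_ONCE()
 * is the usual kernel accessor from <linux/compiler.h>.
 */
#include <linux/compiler.h>
#include <asm/processor.h>

static int example_flag;

static void example_wait_for_flag(void)
{
	/* Spin until another CPU sets example_flag, relaxing every pass */
	while (!READ_ONCE(example_flag))
		cpu_relax();
}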
// SPDX-License-Identifier: GPL-2.0-or-later /* * CIPSO - Commercial IP Security Option * * This is an implementation of the CIPSO 2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in * FIPS-188.
While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> /* List of available DOI definitions */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; int cipso_v4_cache_bucketsize = 10; #define CIPSO_V4_CACHE_BUCKETBITS 7 #define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS) #define CIPSO_V4_CACHE_REORDERLIMIT 10 struct cipso_v4_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt; int cipso_v4_rbm_strictvalid = 1; /* * Protocol Constants */ /* Maximum size of the CIPSO IP option, derived from the fact that the maximum * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ #define CIPSO_V4_OPT_LEN_MAX 40 /* Length of the base CIPSO option, this includes the option type (1 byte), the * option length (1 byte), and the DOI (4 bytes). */ #define CIPSO_V4_HDR_LEN 6 /* Base length of the restrictive category bitmap tag (tag #1). */ #define CIPSO_V4_TAG_RBM_BLEN 4 /* Base length of the enumerated category tag (tag #2). */ #define CIPSO_V4_TAG_ENUM_BLEN 4 /* Base length of the ranged categories bitmap tag (tag #5). */ #define CIPSO_V4_TAG_RNG_BLEN 4 /* The maximum number of category ranges permitted in the ranged category tag * (tag #5). You may note that the IETF draft states that the maximum number * of category ranges is 7, but if the low end of the last category range is * zero then it is possible to fit 8 category ranges because the zero should * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 /* Base length of the local tag (non-standard tag). 
* Tag definition (may change between kernel versions) * * 0 8 16 24 32 * +----------+----------+----------+----------+ * | 10000000 | 00000110 | 32-bit secid value | * +----------+----------+----------+----------+ * | in (host byte order)| * +----------+----------+ * */ #define CIPSO_V4_TAG_LOC_BLEN 6 /* * Helper Functions */ /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /* * Label Mapping Cache Functions */ /** * cipso_v4_cache_init - Initialize the CIPSO cache * * Description: * Initializes the CIPSO label mapping cache; this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init cipso_v4_cache_init(void) { u32 iter; cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, sizeof(struct cipso_v4_map_cache_bkt), GFP_KERNEL); if (!cipso_v4_cache) return -ENOMEM; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_init(&cipso_v4_cache[iter].lock); cipso_v4_cache[iter].size = 0; INIT_LIST_HEAD(&cipso_v4_cache[iter].list); } return 0; } /** * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) { struct cipso_v4_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_bh(&cipso_v4_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &cipso_v4_cache[iter].list, list) { list_del(&entry->list); cipso_v4_cache_entry_free(entry); } cipso_v4_cache[iter].size = 0; spin_unlock_bh(&cipso_v4_cache[iter].lock); } } /** * cipso_v4_cache_check - Check the CIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is greater than * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error.
* */ static int cipso_v4_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct cipso_v4_map_cache_entry *entry; struct cipso_v4_map_cache_entry *prev_entry = NULL; u32 hash; if (!READ_ONCE(cipso_v4_cache_enabled)) return -ENOENT; hash = cipso_v4_map_cache_hash(key, key_len); bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CIPSOV4; if (!prev_entry) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CIPSO_V4_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return -ENOENT; } /** * cipso_v4_cache_add - Add an entry to the CIPSO cache * @cipso_ptr: pointer to CIPSO IP option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. * */ int cipso_v4_cache_add(const unsigned char *cipso_ptr, const struct netlbl_lsm_secattr *secattr) { int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize); int ret_val = -EPERM; u32 bkt; struct cipso_v4_map_cache_entry *entry = NULL; struct cipso_v4_map_cache_entry *old_entry = NULL; u32 cipso_ptr_len; if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0) return 0; cipso_ptr_len = cipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = cipso_ptr_len; entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); if (cipso_v4_cache[bkt].size < bkt_size) { list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache[bkt].size += 1; } else { old_entry = list_entry(cipso_v4_cache[bkt].list.prev, struct cipso_v4_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache_entry_free(old_entry); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; cache_add_failure: if (entry) cipso_v4_cache_entry_free(entry); return ret_val; } /* * DOI List Functions */ /** * cipso_v4_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. 
*/ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) { struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see cipso_ipv4.h for details). Returns * zero on success and non-zero on failure. * */ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 iter; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) goto doi_add_return; for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: break; case CIPSO_V4_TAG_RANGE: case CIPSO_V4_TAG_ENUM: if (doi_def->type != CIPSO_V4_MAP_PASS) goto doi_add_return; break; case CIPSO_V4_TAG_LOCAL: if (doi_def->type != CIPSO_V4_MAP_LOCAL) goto doi_add_return; break; case CIPSO_V4_TAG_INVALID: if (iter == 0) goto doi_add_return; break; default: goto doi_add_return; } } refcount_set(&doi_def->refcount, 1); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi)) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list); spin_unlock(&cipso_v4_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CIPSO_V4_MAP_TRANS: type_str = "trans"; break; case CIPSO_V4_MAP_PASS: type_str = "pass"; break; case CIPSO_V4_MAP_LOCAL: type_str = "local"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " cipso_doi=%u cipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) { if (!doi_def) return; switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: kfree(doi_def->map.std->lvl.cipso); kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); kfree(doi_def->map.std); break; } kfree(doi_def); } /** * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) { struct cipso_v4_doi *doi_def; doi_def = container_of(entry, struct cipso_v4_doi, rcu); cipso_v4_doi_free(doi_def); } /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. 
Returns zero on success and negative values on failure. * */ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); if (!doi_def) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " cipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * rcu_read_lock() is held while accessing the returned definition and the DOI * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { struct cipso_v4_doi *doi_def; rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * cipso_v4_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). * */ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** * cipso_v4_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int cipso_v4_doi_walk(u32 *skip_cnt, int (*callback) (struct cipso_v4_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct cipso_v4_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /* * Label Mapping Functions */ /** * cipso_v4_map_lvl_valid - Checks to see if the given level is understood * @doi_def: the DOI definition * @level: the level to check * * Description: * Checks the given level against the given DOI definition and returns a * negative value if the level does not have a valid mapping and a zero value * if the level is defined by the DOI. 
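 *
 * For a CIPSO_V4_MAP_TRANS DOI the check below reduces to a bounds test
 * plus a translation table lookup, i.e. the level is valid when:
 *
 *	level < doi_def->map.std->lvl.cipso_size &&
 *	doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL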
* */ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: if ((level < doi_def->map.std->lvl.cipso_size) && (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } return -EFAULT; } /** * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network * @doi_def: the DOI definition * @host_lvl: the host MLS level * @net_lvl: the network/CIPSO MLS level * * Description: * Perform a label mapping to translate a local MLS level to the correct * CIPSO level using the given DOI definition. Returns zero on success, * negative values otherwise. * */ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, u32 host_lvl, u32 *net_lvl) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host * @doi_def: the DOI definition * @net_lvl: the network/CIPSO MLS level * @host_lvl: the host MLS level * * Description: * Perform a label mapping to translate a CIPSO level to the correct local MLS * level using the given DOI definition. Returns zero on success, negative * values otherwise. * */ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, u32 net_lvl, u32 *host_lvl) { struct cipso_v4_std_map_tbl *map_tbl; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { *host_lvl = doi_def->map.std->lvl.cipso[net_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid * @doi_def: the DOI definition * @bitmap: category bitmap * @bitmap_len: bitmap length in bytes * * Description: * Checks the given category bitmap against the given DOI definition and * returns a negative value if any of the categories in the bitmap do not have * a valid mapping and a zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, const unsigned char *bitmap, u32 bitmap_len) { int cat = -1; u32 bitmap_len_bits = bitmap_len * 8; u32 cipso_cat_size; u32 *cipso_array; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { cat = netlbl_bitmap_walk(bitmap, bitmap_len_bits, cat + 1, 1); if (cat < 0) break; if (cat >= cipso_cat_size || cipso_array[cat] >= CIPSO_V4_INV_CAT) return -EFAULT; } if (cat == -1) return 0; break; } return -EFAULT; } /** * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. 
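 *
 * As a concrete example of the return value: if the highest category bit
 * set in the network bitmap is bit 10, then eleven bits are in use and the
 * function returns 2, the minimum number of whole bytes needed.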
* */ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int host_spot = -1; u32 net_spot = CIPSO_V4_INV_CAT; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; u32 host_cat_size = 0; u32 *host_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } for (;;) { host_spot = netlbl_catmap_walk(secattr->attr.mls.cat, host_spot + 1); if (host_spot < 0) break; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; if (net_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } if (net_spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; } if (++net_spot_max % 8) return net_spot_max / 8 + 1; return net_spot_max / 8; } /** * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int net_spot = -1; u32 host_spot = CIPSO_V4_INV_CAT; u32 net_clen_bits = net_cat_len * 8; u32 net_cat_size = 0; u32 *net_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } for (;;) { net_spot = netlbl_bitmap_walk(net_cat, net_clen_bits, net_spot + 1, 1); if (net_spot < 0) { if (net_spot == -2) return -EFAULT; return 0; } switch (doi_def->type) { case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; if (host_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, host_spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @enumcat: category list * @enumcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. 
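 *
 * The enumerated category list is a series of 16-bit big-endian values
 * which must be strictly increasing; for example, the byte stream
 * 0x00 0x01 0x00 0x05 encodes categories 1 and 5 and passes the check,
 * while the reverse ordering is rejected with -EFAULT.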
* */ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, const unsigned char *enumcat, u32 enumcat_len) { u16 cat; int cat_prev = -1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01) return -EFAULT; for (iter = 0; iter < enumcat_len; iter += 2) { cat = get_unaligned_be16(&enumcat[iter]); if (cat <= cat_prev) return -EFAULT; cat_prev = cat; } return 0; } /** * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int cat = -1; u32 cat_iter = 0; for (;;) { cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) return -ENOSPC; *((__be16 *)&net_cat[cat_iter]) = htons(cat); cat_iter += 2; } return cat_iter; } /** * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, get_unaligned_be16(&net_cat[iter]), GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /** * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @rngcat: category list * @rngcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. 
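 *
 * Ranged tag categories are carried as 16-bit big-endian (high, low)
 * pairs sorted in descending order; the final pair may omit its low
 * value, which then defaults to zero (see the cat_low handling below).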
* */ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, const unsigned char *rngcat, u32 rngcat_len) { u16 cat_high; u16 cat_low; u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01) return -EFAULT; for (iter = 0; iter < rngcat_len; iter += 4) { cat_high = get_unaligned_be16(&rngcat[iter]); if ((iter + 4) <= rngcat_len) cat_low = get_unaligned_be16(&rngcat[iter + 2]); else cat_low = 0; if (cat_high > cat_prev) return -EFAULT; cat_prev = cat_low; } return 0; } /** * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int iter = -1; u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; /* make sure we don't overflow the 'array[]' variable */ if (net_cat_len > (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) return -ENOSPC; for (;;) { iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 0 : sizeof(u16)); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; } for (iter = 0; array_cnt > 0;) { *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]); iter += 2; array_cnt--; if (array[array_cnt] != 0) { *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]); iter += 2; } } return cat_size; } /** * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 net_iter; u16 cat_low; u16 cat_high; for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = get_unaligned_be16(&net_cat[net_iter]); if ((net_iter + 4) <= net_cat_len) cat_low = get_unaligned_be16(&net_cat[net_iter + 2]); else cat_low = 0; ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat, cat_low, cat_high, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /* * Protocol Handling Functions */ /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition * @len: the total tag length in bytes, not including this header * @buf: the CIPSO option buffer * * Description: * Write a CIPSO header into the beginning of @buffer. 
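 *
 * The resulting six byte header, matching the code below:
 *
 *	buf[0]    IPOPT_CIPSO (0x86)
 *	buf[1]    total option length (CIPSO_V4_HDR_LEN + len)
 *	buf[2-5]  DOI, 32-bit big-endian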
* */ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, unsigned char *buf, u32 len) { buf[0] = IPOPT_CIPSO; buf[1] = CIPSO_V4_HDR_LEN + len; put_unaligned_be32(doi_def->doi, &buf[2]); } /** * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The * actual buffer length may be larger than the indicated size due to * translation between host and network category bitmaps. Returns the size of * the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rbm_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; /* This will send packets using the "optimized" format when * possible as specified in section 3.4.2.6 of the * CIPSO draft. */ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 && ret_val <= 10) tag_len = 14; else tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the * size of the tag on success, negative values on failure. 
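 *
 * The generated tag uses the layout shared by the MLS tags here: tag[0]
 * holds the tag type, tag[1] the tag length, tag[2] is left zero by the
 * caller's memset(), tag[3] carries the mapped level, and any mapped
 * categories start at tag[4].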
* */ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_enum_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO enumerated tag (tag type #2) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the ranged tag, tag type #5. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rng_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO ranged tag (tag type #5) and return the security attributes * in @secattr. Return zero on success, negatives values on failure. 
* */ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the local tag. Returns the size of the tag * on success, negative values on failure. * */ static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { if (!(secattr->flags & NETLBL_SECATTR_SECID)) return -EPERM; buffer[0] = CIPSO_V4_TAG_LOCAL; buffer[1] = CIPSO_V4_TAG_LOC_BLEN; *(u32 *)&buffer[2] = secattr->attr.secid; return CIPSO_V4_TAG_LOC_BLEN; } /** * cipso_v4_parsetag_loc - Parse a CIPSO local tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO local tag and return the security attributes in @secattr. * Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { secattr->attr.secid = *(u32 *)&tag[2]; secattr->flags |= NETLBL_SECATTR_SECID; return 0; } /** * cipso_v4_optptr - Find the CIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]); int optlen; int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { case IPOPT_END: return NULL; case IPOPT_NOOP: taglen = 1; break; default: taglen = optptr[1]; } if (!taglen || taglen > optlen) return NULL; if (optptr[0] == IPOPT_CIPSO) return optptr; optlen -= taglen; optptr += taglen; } return NULL; } /** * cipso_v4_validate - Validate a CIPSO option * @skb: the packet * @option: the start of the option, on error it is set to point to the error * * Description: * This routine is called to validate a CIPSO option, it checks all of the * fields to ensure that they are at least valid, see the draft snippet below * for details. If the option is valid then a zero value is returned and * the value of @option is unchanged. If the option is invalid then a * non-zero value is returned and @option is adjusted to point to the * offending portion of the option. From the IETF draft ... * * "If any field within the CIPSO options, such as the DOI identifier, is not * recognized the IP datagram is discarded and an ICMP 'parameter problem' * (type 12) is generated and returned. The ICMP code field is set to 'bad * parameter' (code 0) and the pointer is set to the start of the CIPSO field * that is unrecognized." 
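 *
 * As a concrete example of the returned pointer: an undersized option
 * length sets the offset to 1 (the length octet) and an unrecognized DOI
 * sets it to 2 (the start of the DOI field), as seen in the code below.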
* */ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; unsigned char opt_iter; unsigned char err_offset = 0; u8 opt_len; u8 tag_len; struct cipso_v4_doi *doi_def = NULL; u32 tag_iter; /* caller already checks for length values that are too large */ opt_len = opt[1]; if (opt_len < 8) { err_offset = 1; goto validate_return; } rcu_read_lock(); doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); if (!doi_def) { err_offset = 2; goto validate_return_locked; } opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID || ++tag_iter == CIPSO_V4_TAG_MAXCNT) { err_offset = opt_iter; goto validate_return_locked; } if (opt_iter + 1 == opt_len) { err_offset = opt_iter; goto validate_return_locked; } tag_len = tag[1]; if (tag_len > (opt_len - opt_iter)) { err_offset = opt_iter + 1; goto validate_return_locked; } switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } /* We are already going to do all the verification * necessary at the socket layer so from our point of * view it is safe to turn these checks off (and less * work), however, the CIPSO draft says we should do * all the CIPSO validations here but it doesn't * really specify _exactly_ what we need to validate * ... so, just make it a sysctl tunable. */ if (READ_ONCE(cipso_v4_rbm_strictvalid)) { if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } } break; case CIPSO_V4_TAG_ENUM: if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_RANGE: if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is * not the loopback device drop the packet. Further, * there is no legitimate reason for setting this from * userspace so reject it if skb is NULL. 
*/ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } break; default: err_offset = opt_iter; goto validate_return_locked; } tag += tag_len; opt_iter += tag_len; } validate_return_locked: rcu_read_unlock(); validate_return: *option = opt + err_offset; return err_offset; } /** * cipso_v4_error - Send the correct response for a bad packet * @skb: the packet * @error: the error code * @gateway: CIPSO gateway flag * * Description: * Based on the error code given in @error, send an ICMP error message back to * the originating host. From the IETF draft ... * * "If the contents of the CIPSO [option] are valid but the security label is * outside of the configured host or port label range, the datagram is * discarded and an ICMP 'destination unreachable' (type 3) is generated and * returned. The code field of the ICMP is set to 'communication with * destination network administratively prohibited' (code 9) or to * 'communication with destination host administratively prohibited' * (code 10). The value of the code is dependent on whether the originator * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The * recipient of the ICMP message MUST be able to handle either value. The * same procedure is performed if a CIPSO [option] can not be added to an * IP packet because it is too large to fit in the IP options area." * * "If the error is triggered by receipt of an ICMP message, the message is * discarded and no response is permitted (consistent with general ICMP * processing rules)." * */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; /* * We might be called above the IP layer, * so we can not use icmp_send and IPCB here. */ memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); rcu_read_lock(); res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** * cipso_v4_genopt - Generate a CIPSO option * @buf: the option buffer * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CIPSO option using the DOI definition and security attributes * passed to the function. Returns the length of the option on success and * negative values on failure. * */ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; if (buf_len <= CIPSO_V4_HDR_LEN) return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. 
*/ iter = 0; do { memset(buf, 0, buf_len); switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_gentag_rbm(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_gentag_enum(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_gentag_rng(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_gentag_loc(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; default: return -EPERM; } iter++; } while (ret_val < 0 && iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); return CIPSO_V4_HDR_LEN + ret_val; } /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; /* In the case of sock_create_lite(), the sock->sk field is not * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ if (!sk) return 0; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto socket_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto socket_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. 
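 *
 * The option length is rounded up to a multiple of four bytes below,
 * e.g. a 14 byte CIPSO option yields a 16 byte IP options area since
 * (14 + 3) & ~3 == 16.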
*/ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto socket_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); old = rcu_dereference_protected(sk_inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } rcu_assign_pointer(sk_inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); return 0; socket_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int cipso_v4_req_setattr(struct request_sock *req, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto req_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto req_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto req_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); if (opt) kfree_rcu(opt, rcu); return 0; req_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_delopt - Delete the CIPSO option from a set of IP options * @opt_ptr: IP option pointer * * Description: * Deletes the CIPSO IP option from a set of IP options and makes the necessary * adjustments to the IP option structure. Returns zero on success, negative * values on failure. 
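 *
 * (Strictly speaking, on success the function returns the number of bytes
 * trimmed from the IP options area; callers such as
 * cipso_v4_sock_delattr() use this delta to shrink icsk_ext_hdr_len.)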
* */ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; if (!opt || opt->opt.cipso == 0) return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int iter; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; if (opt->opt.srr > opt->opt.cipso) opt->opt.srr -= cipso_len; if (opt->opt.rr > opt->opt.cipso) opt->opt.rr -= cipso_len; if (opt->opt.ts > opt->opt.cipso) opt->opt.ts -= cipso_len; if (opt->opt.router_alert > opt->opt.cipso) opt->opt.router_alert -= cipso_len; opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at * this point is walk the options one-by-one, skipping the * padding at the end to determine the actual option size and * from there we can determine the new total option length */ iter = 0; optlen_new = 0; while (iter < opt->opt.optlen) if (opt->opt.__data[iter] != IPOPT_NOP) { iter += opt->opt.__data[iter + 1]; optlen_new = iter; } else iter++; hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; hdr_delta = opt->opt.optlen; kfree_rcu(opt, rcu); } return hdr_delta; } /** * cipso_v4_sock_delattr - Delete the CIPSO option from a socket * @sk: the socket * * Description: * Removes the CIPSO option from a socket, if present. * */ void cipso_v4_sock_delattr(struct sock *sk) { struct inet_sock *sk_inet; int hdr_delta; sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } } /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket * @req: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. * */ void cipso_v4_req_delattr(struct request_sock *req) { cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: * Inspect @cipso and return the security attributes in @secattr. Returns zero * on success and negative values on failure. * */ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi; struct cipso_v4_doi *doi_def; if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(&cipso[2]); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. 
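 *
 * Since the CIPSO header is CIPSO_V4_HDR_LEN (six) bytes long, cipso[6]
 * below is the type octet of the first - and, per the assumption above,
 * only - tag in the option.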
*/ switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); return ret_val; } /** * cipso_v4_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CIPSO option attached to the sock and if * there is return the CIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ip_options_rcu *opt; int res = -ENOMSG; rcu_read_lock(); opt = rcu_dereference(inet_sk(sk)->inet_opt); if (opt && opt->opt.cipso) res = cipso_v4_getattr(opt->opt.__data + opt->opt.cipso - sizeof(struct iphdr), secattr); rcu_read_unlock(); return res; } /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet * @doi_def: the DOI structure * @secattr: the security attributes * * Description: * Set the CIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int cipso_v4_skbuff_setattr(struct sk_buff *skb, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; u32 buf_len = CIPSO_V4_OPT_LEN_MAX; u32 opt_len; int len_delta; ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) return ret_val; buf_len = ret_val; opt_len = (buf_len + 3) & ~3; /* we overwrite any existing options to ensure that we have enough * room for the CIPSO option, the reason is that we _need_ to guarantee * that the security label is applied to the packet - we do the same * thing when using the socket options and it hasn't caused a problem, * if we need to we can always revisit this choice later */ len_delta = opt_len - opt->optlen; /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; if (len_delta > 0) { /* we assume that the header + opt->optlen have already been * "pushed" in ip_options_build() or similar */ iph = ip_hdr(skb); skb_push(skb, len_delta); memmove((char *)iph - len_delta, iph, iph->ihl << 2); skb_reset_network_header(skb); iph = ip_hdr(skb); } else if (len_delta < 0) { iph = ip_hdr(skb); memset(iph + 1, IPOPT_NOP, opt->optlen); } else iph = ip_hdr(skb); if (opt->optlen > 0) memset(opt, 0, sizeof(*opt)); opt->optlen = opt_len; opt->cipso = sizeof(struct iphdr); opt->is_changed = 1; /* we have to do the following because we are being called from a * netfilter hook which means the packet already has had the header * fields populated and the checksum calculated - yes this means we * are doing more work than needed but we do it to keep the core * stack clean and tidy */ memcpy(iph + 1, 
	       buf, buf_len);
	if (opt_len > buf_len)
		memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
	if (len_delta != 0) {
		iph->ihl = 5 + (opt_len >> 2);
		iph_set_totlen(iph, skb->len);
	}
	ip_send_check(iph);

	return 0;
}

/**
 * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
 * @skb: the packet
 *
 * Description:
 * Removes any and all CIPSO options from the given packet.  Returns zero on
 * success, negative values on failure.
 *
 */
int cipso_v4_skbuff_delattr(struct sk_buff *skb)
{
	int ret_val;
	struct iphdr *iph;
	struct ip_options *opt = &IPCB(skb)->opt;
	unsigned char *cipso_ptr;

	if (opt->cipso == 0)
		return 0;

	/* since we are changing the packet we should make a copy */
	ret_val = skb_cow(skb, skb_headroom(skb));
	if (ret_val < 0)
		return ret_val;

	/* the easiest thing to do is just replace the cipso option with noop
	 * options since we don't change the size of the packet, although we
	 * still need to recalculate the checksum */

	iph = ip_hdr(skb);
	cipso_ptr = (unsigned char *)iph + opt->cipso;
	memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
	opt->cipso = 0;
	opt->is_changed = 1;

	ip_send_check(iph);

	return 0;
}

/*
 * Setup Functions
 */

/**
 * cipso_v4_init - Initialize the CIPSO module
 *
 * Description:
 * Initialize the CIPSO module and prepare it for use.  Returns zero on
 * success and negative values on failure.
 *
 */
static int __init cipso_v4_init(void)
{
	int ret_val;

	ret_val = cipso_v4_cache_init();
	if (ret_val != 0)
		panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
		      ret_val);

	return 0;
}

subsys_initcall(cipso_v4_init);
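/*
 * Illustrative only: a minimal sketch of the DOI lifecycle built from the
 * functions above.  The caller, the DOI value and the pass-through
 * configuration are hypothetical; error handling is elided.
 *
 *	struct cipso_v4_doi *doi_def;
 *
 *	doi_def = kzalloc(sizeof(*doi_def), GFP_KERNEL);
 *	doi_def->doi = 3;
 *	doi_def->type = CIPSO_V4_MAP_PASS;
 *	doi_def->tags[0] = CIPSO_V4_TAG_RBITMAP;
 *	if (cipso_v4_doi_add(doi_def, audit_info) == 0) {
 *		(use cipso_v4_doi_getdef()/cipso_v4_doi_putdef() for refs)
 *		cipso_v4_doi_remove(3, audit_info);
 *	}
 */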
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2011 Instituto Nokia de Tecnologia
 * Copyright (C) 2014 Marvell International Ltd.
 *
 * Authors:
 *    Lauro Ramos Venancio <lauro.venancio@openbossa.org>
 *    Aloisio Almeida Jr <aloisio.almeida@openbossa.org>
 */

#ifndef __NET_NFC_H
#define __NET_NFC_H

#include <linux/nfc.h>
#include <linux/device.h>
#include <linux/skbuff.h>

#define nfc_dbg(dev, fmt, ...) dev_dbg((dev), "NFC: " fmt, ##__VA_ARGS__)
#define nfc_info(dev, fmt, ...) dev_info((dev), "NFC: " fmt, ##__VA_ARGS__)
#define nfc_err(dev, fmt, ...) dev_err((dev), "NFC: " fmt, ##__VA_ARGS__)

struct nfc_phy_ops {
	int (*write)(void *dev_id, struct sk_buff *skb);
	int (*enable)(void *dev_id);
	void (*disable)(void *dev_id);
};

struct nfc_dev;

/**
 * data_exchange_cb_t - Definition of nfc_data_exchange callback
 *
 * @context: nfc_data_exchange cb_context parameter
 * @skb: response data
 * @err: If an error has occurred during data exchange, it is the
 *	error number. Zero means no error.
 *
 * When a rx or tx package is lost or corrupted or the target gets out
 * of the operating field, err is -EIO.
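 *
 * A minimal callback sketch (hypothetical driver context, shown only to
 * illustrate the contract):
 *
 *	static void my_xfer_done(void *context, struct sk_buff *skb, int err)
 *	{
 *		struct my_ctx *ctx = context;
 *
 *		if (err) {
 *			(handle -EIO and friends)
 *			return;
 *		}
 *		(consume the response in skb)
 *	}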
 */
typedef void (*data_exchange_cb_t)(void *context, struct sk_buff *skb,
				   int err);

typedef void (*se_io_cb_t)(void *context, u8 *apdu, size_t apdu_len, int err);

struct nfc_target;

struct nfc_ops {
	int (*dev_up)(struct nfc_dev *dev);
	int (*dev_down)(struct nfc_dev *dev);
	int (*start_poll)(struct nfc_dev *dev,
			  u32 im_protocols, u32 tm_protocols);
	void (*stop_poll)(struct nfc_dev *dev);
	int (*dep_link_up)(struct nfc_dev *dev, struct nfc_target *target,
			   u8 comm_mode, u8 *gb, size_t gb_len);
	int (*dep_link_down)(struct nfc_dev *dev);
	int (*activate_target)(struct nfc_dev *dev, struct nfc_target *target,
			       u32 protocol);
	void (*deactivate_target)(struct nfc_dev *dev,
				  struct nfc_target *target, u8 mode);
	int (*im_transceive)(struct nfc_dev *dev, struct nfc_target *target,
			     struct sk_buff *skb, data_exchange_cb_t cb,
			     void *cb_context);
	int (*tm_send)(struct nfc_dev *dev, struct sk_buff *skb);
	int (*check_presence)(struct nfc_dev *dev, struct nfc_target *target);
	int (*fw_download)(struct nfc_dev *dev, const char *firmware_name);

	/* Secure Element API */
	int (*discover_se)(struct nfc_dev *dev);
	int (*enable_se)(struct nfc_dev *dev, u32 se_idx);
	int (*disable_se)(struct nfc_dev *dev, u32 se_idx);
	int (*se_io)(struct nfc_dev *dev, u32 se_idx,
		     u8 *apdu, size_t apdu_length,
		     se_io_cb_t cb, void *cb_context);
};

#define NFC_TARGET_IDX_ANY -1
#define NFC_MAX_GT_LEN 48
#define NFC_ATR_RES_GT_OFFSET 15
#define NFC_ATR_REQ_GT_OFFSET 14

/**
 * struct nfc_target - NFC target description
 *
 * @sens_res: 2 bytes describing the target SENS_RES response, if the target
 *	is a type A one. The %sens_res most significant byte must be byte 2
 *	as described by the NFC Forum digital specification (i.e. the platform
 *	configuration one) while %sens_res least significant byte is byte 1.
 */
struct nfc_target {
	u32 idx;
	u32 supported_protocols;
	u16 sens_res;
	u8 sel_res;
	u8 nfcid1_len;
	u8 nfcid1[NFC_NFCID1_MAXSIZE];
	u8 nfcid2_len;
	u8 nfcid2[NFC_NFCID2_MAXSIZE];
	u8 sensb_res_len;
	u8 sensb_res[NFC_SENSB_RES_MAXSIZE];
	u8 sensf_res_len;
	u8 sensf_res[NFC_SENSF_RES_MAXSIZE];
	u8 hci_reader_gate;
	u8 logical_idx;
	u8 is_iso15693;
	u8 iso15693_dsfid;
	u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE];
};

/**
 * nfc_se - A structure for NFC accessible secure elements.
 *
 * @idx: The secure element index. User space will enable or
 *	disable a secure element by its index.
 * @type: The secure element type. It can be SE_UICC or
 *	SE_EMBEDDED.
 * @state: The secure element state, either enabled or disabled.
 *
 */
struct nfc_se {
	struct list_head list;
	u32 idx;
	u16 type;
	u16 state;
};

/**
 * nfc_evt_transaction - A struct for NFC secure element event transaction.
 *
 * @aid: The application identifier triggering the event
 *
 * @aid_len: The application identifier length [5:16]
 *
 * @params: The application parameters transmitted during the transaction
 *
 * @params_len: The application parameters length [0:255]
 *
 */
#define NFC_MIN_AID_LENGTH 5
#define NFC_MAX_AID_LENGTH 16
#define NFC_MAX_PARAMS_LENGTH 255
#define NFC_EVT_TRANSACTION_AID_TAG 0x81
#define NFC_EVT_TRANSACTION_PARAMS_TAG 0x82
struct nfc_evt_transaction {
	u32 aid_len;
	u8 aid[NFC_MAX_AID_LENGTH];
	u8 params_len;
	u8 params[];
} __packed;

struct nfc_genl_data {
	u32 poll_req_portid;
	struct mutex genl_data_mutex;
};

struct nfc_vendor_cmd {
	__u32 vendor_id;
	__u32 subcmd;
	int (*doit)(struct nfc_dev *dev, void *data, size_t data_len);
};

struct nfc_dev {
	int idx;
	u32 target_next_idx;
	struct nfc_target *targets;
	int n_targets;
	int targets_generation;
	struct device dev;
	bool dev_up;
	bool fw_download_in_progress;
	u8 rf_mode;
	bool polling;
	struct nfc_target *active_target;
	bool dep_link_up;
	struct nfc_genl_data genl_data;
	u32 supported_protocols;

	struct list_head secure_elements;

	int tx_headroom;
	int tx_tailroom;

	struct timer_list check_pres_timer;
	struct work_struct check_pres_work;

	bool shutting_down;

	struct rfkill *rfkill;

	const struct nfc_vendor_cmd *vendor_cmds;
	int n_vendor_cmds;

	const struct nfc_ops *ops;
	struct genl_info *cur_cmd_info;
};
#define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev)

extern struct class nfc_class;

struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops,
				    u32 supported_protocols,
				    int tx_headroom,
				    int tx_tailroom);

/**
 * nfc_free_device - free nfc device
 *
 * @dev: The nfc device to free
 */
static inline void nfc_free_device(struct nfc_dev *dev)
{
	put_device(&dev->dev);
}

int nfc_register_device(struct nfc_dev *dev);

void nfc_unregister_device(struct nfc_dev *dev);

/**
 * nfc_set_parent_dev - set the parent device
 *
 * @nfc_dev: The nfc device whose parent is being set
 * @dev: The parent device
 */
static inline void nfc_set_parent_dev(struct nfc_dev *nfc_dev,
				      struct device *dev)
{
	nfc_dev->dev.parent = dev;
}

/**
 * nfc_set_drvdata - set driver specific data
 *
 * @dev: The nfc device
 * @data: Pointer to driver specific data
 */
static inline void nfc_set_drvdata(struct nfc_dev *dev, void *data)
{
	dev_set_drvdata(&dev->dev, data);
}

/**
 * nfc_get_drvdata - get driver specific data
 *
 * @dev: The nfc device
 */
static inline void *nfc_get_drvdata(const struct nfc_dev *dev)
{
	return dev_get_drvdata(&dev->dev);
}

/**
 * nfc_device_name - get the nfc device name
 *
 * @dev: The nfc device whose name to return
 */
static inline const char *nfc_device_name(const struct nfc_dev *dev)
{
	return dev_name(&dev->dev);
}

struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk,
				   unsigned int flags, unsigned int size,
				   unsigned int *err);
struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp);

int nfc_set_remote_general_bytes(struct nfc_dev *dev,
				 const u8 *gt, u8 gt_len);
u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len);

int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
			 u32 result);

int nfc_targets_found(struct nfc_dev *dev,
		      struct nfc_target *targets, int ntargets);
int nfc_target_lost(struct nfc_dev *dev, u32 target_idx);

int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx,
		       u8 comm_mode, u8 rf_mode);

int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode,
		     const u8 *gb, size_t gb_len);
int nfc_tm_deactivated(struct nfc_dev *dev);
int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb);
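/*
 * Typical driver bring-up, sketched from the declarations above (the ops
 * table, protocol mask, headroom values and parent device are hypothetical
 * placeholders):
 *
 *	dev = nfc_allocate_device(&my_nfc_ops, my_protocols,
 *				  MY_HEADROOM, MY_TAILROOM);
 *	if (!dev)
 *		return -ENOMEM;
 *	nfc_set_parent_dev(dev, &pdev->dev);
 *	r = nfc_register_device(dev);
 *	if (r)
 *		nfc_free_device(dev);
 */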
void nfc_driver_failure(struct nfc_dev *dev, int err);

int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx,
		       struct nfc_evt_transaction *evt_transaction);
int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx);
int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type);
int nfc_remove_se(struct nfc_dev *dev, u32 se_idx);
struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx);

void nfc_send_to_raw_sock(struct nfc_dev *dev, struct sk_buff *skb,
			  u8 payload_type, u8 direction);

static inline int nfc_set_vendor_cmds(struct nfc_dev *dev,
				      const struct nfc_vendor_cmd *cmds,
				      int n_cmds)
{
	if (dev->vendor_cmds || dev->n_vendor_cmds)
		return -EINVAL;

	dev->vendor_cmds = cmds;
	dev->n_vendor_cmds = n_cmds;

	return 0;
}

struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev,
						 enum nfc_attrs attr,
						 u32 oui, u32 subcmd,
						 int approxlen);
int nfc_vendor_cmd_reply(struct sk_buff *skb);

/**
 * nfc_vendor_cmd_alloc_reply_skb - allocate vendor command reply
 * @dev: nfc device
 * @oui: vendor oui
 * @subcmd: vendor subcommand
 * @approxlen: an upper bound of the length of the data that will
 *	be put into the skb
 *
 * This function allocates and pre-fills an skb for a reply to
 * a vendor command. Since it is intended for a reply, calling
 * it outside of a vendor command's doit() operation is invalid.
 *
 * The returned skb is pre-filled with some identifying data in
 * a way that any data that is put into the skb (with skb_put(),
 * nla_put() or similar) will end up being within the
 * %NFC_ATTR_VENDOR_DATA attribute, so all that needs to be done
 * with the skb is adding data for the corresponding userspace tool
 * which can then read that data out of the vendor data attribute.
 * You must not modify the skb in any other way.
 *
 * When done, call nfc_vendor_cmd_reply() with the skb and return
 * its error code as the result of the doit() operation.
 *
 * Return: An allocated and pre-filled skb. %NULL if any errors happen.
 */
static inline struct sk_buff *
nfc_vendor_cmd_alloc_reply_skb(struct nfc_dev *dev,
			       u32 oui, u32 subcmd, int approxlen)
{
	return __nfc_alloc_vendor_cmd_reply_skb(dev, NFC_ATTR_VENDOR_DATA,
						oui, subcmd, approxlen);
}

#endif /* __NET_NFC_H */
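/*
 * For reference, a vendor command doit() sketch following the
 * nfc_vendor_cmd_alloc_reply_skb() rules documented above (all names are
 * hypothetical):
 *
 *	static int my_vendor_doit(struct nfc_dev *dev, void *data, size_t len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = nfc_vendor_cmd_alloc_reply_skb(dev, MY_OUI,
 *						     MY_SUBCMD, len);
 *		if (!skb)
 *			return -ENOMEM;
 *		skb_put_data(skb, data, len);
 *		return nfc_vendor_cmd_reply(skb);
 *	}
 */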
// SPDX-License-Identifier: GPL-2.0-or-later
/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
 *
 * Copyright (C) 2013 secunet Security Networks AG
 *
 * Author:
 * Steffen Klassert <steffen.klassert@secunet.com>
 *
 * Based on:
 * net/ipv4/tunnel4.c
 */

#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/skbuff.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/xfrm.h>

static struct xfrm4_protocol __rcu *esp4_handlers __read_mostly;
static struct xfrm4_protocol __rcu *ah4_handlers __read_mostly;
static struct xfrm4_protocol __rcu *ipcomp4_handlers __read_mostly;
static DEFINE_MUTEX(xfrm4_protocol_mutex);

static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
{
	switch (protocol) {
	case IPPROTO_ESP:
		return &esp4_handlers;
	case IPPROTO_AH:
		return &ah4_handlers;
	case IPPROTO_COMP:
		return &ipcomp4_handlers;
	}

	return NULL;
}

#define for_each_protocol_rcu(head, handler)		\
	for (handler = rcu_dereference(head);		\
	     handler != NULL;				\
	     handler = rcu_dereference(handler->next))	\

static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
{
	int ret;
	struct xfrm4_protocol *handler;
	struct xfrm4_protocol __rcu **head = proto_handlers(protocol);

	if (!head)
		return 0;

	for_each_protocol_rcu(*head, handler)
		if ((ret = handler->cb_handler(skb, err)) <= 0)
			return ret;

	return 0;
}

int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
		    int encap_type)
{
	int ret;
	struct xfrm4_protocol *handler;
	struct xfrm4_protocol __rcu **head = proto_handlers(nexthdr);

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
	XFRM_SPI_SKB_CB(skb)->family = AF_INET;
	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);

	if (!head)
		goto out;

	if (!skb_dst(skb)) {
		const struct iphdr *iph = ip_hdr(skb);

		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
					 iph->tos, skb->dev))
			goto drop;
	}

	for_each_protocol_rcu(*head, handler)
		if ((ret = handler->input_handler(skb, nexthdr, spi,
						  encap_type)) != -EINVAL)
			return ret;

out:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL(xfrm4_rcv_encap);

static int xfrm4_esp_rcv(struct sk_buff *skb)
{
	int ret;
	struct xfrm4_protocol *handler;

	XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;

	for_each_protocol_rcu(esp4_handlers, handler)
		if ((ret = handler->handler(skb)) != -EINVAL)
			return ret;

	icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm4_esp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(esp4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } static int xfrm4_ah_rcv(struct sk_buff *skb) { int ret; struct xfrm4_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; for_each_protocol_rcu(ah4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm4_ah_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ah4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } static int xfrm4_ipcomp_rcv(struct sk_buff *skb) { int ret; struct xfrm4_protocol *handler; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; for_each_protocol_rcu(ipcomp4_handlers, handler) if ((ret = handler->handler(skb)) != -EINVAL) return ret; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); kfree_skb(skb); return 0; } static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ipcomp4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } static const struct net_protocol esp4_protocol = { .handler = xfrm4_esp_rcv, .err_handler = xfrm4_esp_err, .no_policy = 1, }; static const struct net_protocol ah4_protocol = { .handler = xfrm4_ah_rcv, .err_handler = xfrm4_ah_err, .no_policy = 1, }; static const struct net_protocol ipcomp4_protocol = { .handler = xfrm4_ipcomp_rcv, .err_handler = xfrm4_ipcomp_err, .no_policy = 1, }; static const struct xfrm_input_afinfo xfrm4_input_afinfo = { .family = AF_INET, .callback = xfrm4_rcv_cb, }; static inline const struct net_protocol *netproto(unsigned char protocol) { switch (protocol) { case IPPROTO_ESP: return &esp4_protocol; case IPPROTO_AH: return &ah4_protocol; case IPPROTO_COMP: return &ipcomp4_protocol; } return NULL; } int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol) { struct xfrm4_protocol __rcu **pprev; struct xfrm4_protocol *t; bool add_netproto = false; int ret = -EEXIST; int priority = handler->priority; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm4_protocol_mutex); if (!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm4_protocol_mutex))) add_netproto = true; for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; pprev = &t->next) { if (t->priority < priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&xfrm4_protocol_mutex); if (add_netproto) { if (inet_add_protocol(netproto(protocol), protocol)) { pr_err("%s: can't add protocol\n", __func__); ret = -EAGAIN; } } return ret; } EXPORT_SYMBOL(xfrm4_protocol_register); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol) { struct xfrm4_protocol __rcu **pprev; struct xfrm4_protocol *t; int ret = -ENOENT; if (!proto_handlers(protocol) || !netproto(protocol)) return -EINVAL; mutex_lock(&xfrm4_protocol_mutex); for (pprev = proto_handlers(protocol); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&xfrm4_protocol_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } if 
(!rcu_dereference_protected(*proto_handlers(protocol), lockdep_is_held(&xfrm4_protocol_mutex))) { if (inet_del_protocol(netproto(protocol), protocol) < 0) { pr_err("%s: can't remove protocol\n", __func__); ret = -EAGAIN; } } mutex_unlock(&xfrm4_protocol_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_protocol_deregister); void __init xfrm4_protocol_init(void) { xfrm_input_register_afinfo(&xfrm4_input_afinfo); } |
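/*
 * Illustrative sketch (not part of the file above): how a transform module
 * might hook into the ESP demultiplexer. The my_esp_* names are
 * hypothetical; the struct layout and the registration calls come from
 * <net/xfrm.h>. Returning -EINVAL from a handler makes the multiplexer
 * fall through to the next handler in priority order.
 */
static int my_esp_rcv_cb(struct sk_buff *skb, int err)
{
    return 0;   /* <= 0 terminates the callback walk in xfrm4_rcv_cb() */
}

static int my_esp_err(struct sk_buff *skb, u32 info)
{
    return 0;   /* 0 == handled, stops the err_handler walk */
}

static struct xfrm4_protocol my_esp_protocol = {
    .handler       = xfrm4_rcv,   /* standard xfrm receive path */
    .input_handler = xfrm_input,
    .cb_handler    = my_esp_rcv_cb,
    .err_handler   = my_esp_err,
    .priority      = 0,
};

static int __init my_esp_init(void)
{
    /* -EEXIST here means another handler already owns this priority. */
    return xfrm4_protocol_register(&my_esp_protocol, IPPROTO_ESP);
}

static void __exit my_esp_exit(void)
{
    xfrm4_protocol_deregister(&my_esp_protocol, IPPROTO_ESP);
}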
// SPDX-License-Identifier: GPL-2.0-or-later
/* Basic authentication token and access key management
 *
 * Copyright (C) 2004-2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/init.h>
#include <linux/poison.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/workqueue.h>
#include <linux/random.h>
#include <linux/ima.h>
#include <linux/err.h>
#include "internal.h"

struct kmem_cache *key_jar;
struct rb_root key_serial_tree;  /* tree of keys indexed by serial */
DEFINE_SPINLOCK(key_serial_lock);

struct rb_root key_user_tree;    /* tree of quota records indexed by UID */
DEFINE_SPINLOCK(key_user_lock);

unsigned int key_quota_root_maxkeys = 1000000;   /* root's key count quota */
unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */
unsigned int key_quota_maxkeys = 200;            /* general key count quota */
unsigned int key_quota_maxbytes = 20000;         /* general key space quota */

static LIST_HEAD(key_types_list);
static DECLARE_RWSEM(key_types_sem);

/* We serialise key instantiation and link */
DEFINE_MUTEX(key_construction_mutex);

#ifdef KEY_DEBUGGING
void __key_check(const struct key *key)
{
    printk("__key_check: key %p {%08x} should be {%08x}\n",
           key, key->magic, KEY_DEBUG_MAGIC);
    BUG();
}
#endif

/*
 * Get the key quota record for a user, allocating a new record if one doesn't
 * already exist.
*/ struct key_user *key_user_lookup(kuid_t uid) { struct key_user *candidate = NULL, *user; struct rb_node *parent, **p; try_again: parent = NULL; p = &key_user_tree.rb_node; spin_lock(&key_user_lock); /* search the tree for a user record with a matching UID */ while (*p) { parent = *p; user = rb_entry(parent, struct key_user, node); if (uid_lt(uid, user->uid)) p = &(*p)->rb_left; else if (uid_gt(uid, user->uid)) p = &(*p)->rb_right; else goto found; } /* if we get here, we failed to find a match in the tree */ if (!candidate) { /* allocate a candidate user record if we don't already have * one */ spin_unlock(&key_user_lock); user = NULL; candidate = kmalloc(sizeof(struct key_user), GFP_KERNEL); if (unlikely(!candidate)) goto out; /* the allocation may have scheduled, so we need to repeat the * search lest someone else added the record whilst we were * asleep */ goto try_again; } /* if we get here, then the user record still hadn't appeared on the * second pass - so we use the candidate record */ refcount_set(&candidate->usage, 1); atomic_set(&candidate->nkeys, 0); atomic_set(&candidate->nikeys, 0); candidate->uid = uid; candidate->qnkeys = 0; candidate->qnbytes = 0; spin_lock_init(&candidate->lock); mutex_init(&candidate->cons_lock); rb_link_node(&candidate->node, parent, p); rb_insert_color(&candidate->node, &key_user_tree); spin_unlock(&key_user_lock); user = candidate; goto out; /* okay - we found a user record for this UID */ found: refcount_inc(&user->usage); spin_unlock(&key_user_lock); kfree(candidate); out: return user; } /* * Dispose of a user structure */ void key_user_put(struct key_user *user) { if (refcount_dec_and_lock(&user->usage, &key_user_lock)) { rb_erase(&user->node, &key_user_tree); spin_unlock(&key_user_lock); kfree(user); } } /* * Allocate a serial number for a key. These are assigned randomly to avoid * security issues through covert channel problems. */ static inline void key_alloc_serial(struct key *key) { struct rb_node *parent, **p; struct key *xkey; /* propose a random serial number and look for a hole for it in the * serial number tree */ do { get_random_bytes(&key->serial, sizeof(key->serial)); key->serial >>= 1; /* negative numbers are not permitted */ } while (key->serial < 3); spin_lock(&key_serial_lock); attempt_insertion: parent = NULL; p = &key_serial_tree.rb_node; while (*p) { parent = *p; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) p = &(*p)->rb_left; else if (key->serial > xkey->serial) p = &(*p)->rb_right; else goto serial_exists; } /* we've found a suitable hole - arrange for this key to occupy it */ rb_link_node(&key->serial_node, parent, p); rb_insert_color(&key->serial_node, &key_serial_tree); spin_unlock(&key_serial_lock); return; /* we found a key with the proposed serial number - walk the tree from * that point looking for the next unused serial number */ serial_exists: for (;;) { key->serial++; if (key->serial < 3) { key->serial = 3; goto attempt_insertion; } parent = rb_next(parent); if (!parent) goto attempt_insertion; xkey = rb_entry(parent, struct key, serial_node); if (key->serial < xkey->serial) goto attempt_insertion; } } /** * key_alloc - Allocate a key of the specified type. * @type: The type of key to allocate. * @desc: The key description to allow the key to be searched out. * @uid: The owner of the new key. * @gid: The group ID for the new key's group permissions. * @cred: The credentials specifying UID namespace. * @perm: The permissions mask of the new key. 
* @flags: Flags specifying quota properties. * @restrict_link: Optional link restriction for new keyrings. * * Allocate a key of the specified type with the attributes given. The key is * returned in an uninstantiated state and the caller needs to instantiate the * key before returning. * * The restrict_link structure (if not NULL) will be freed when the * keyring is destroyed, so it must be dynamically allocated. * * The user's key count quota is updated to reflect the creation of the key and * the user's key data quota has the default for the key type reserved. The * instantiation function should amend this as necessary. If insufficient * quota is available, -EDQUOT will be returned. * * The LSM security modules can prevent a key being created, in which case * -EACCES will be returned. * * Returns a pointer to the new key if successful and an error code otherwise. * * Note that the caller needs to ensure the key type isn't uninstantiated. * Internally this can be done by locking key_types_sem. Externally, this can * be done by either never unregistering the key type, or making sure * key_alloc() calls don't race with module unloading. */ struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link) { struct key_user *user = NULL; struct key *key; size_t desclen, quotalen; int ret; key = ERR_PTR(-EINVAL); if (!desc || !*desc) goto error; if (type->vet_description) { ret = type->vet_description(desc); if (ret < 0) { key = ERR_PTR(ret); goto error; } } desclen = strlen(desc); quotalen = desclen + 1 + type->def_datalen; /* get hold of the key tracking for this user */ user = key_user_lookup(uid); if (!user) goto no_memory_1; /* check that the user's quota permits allocation of another key and * its description */ if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { unsigned maxkeys = uid_eq(uid, GLOBAL_ROOT_UID) ? key_quota_root_maxkeys : key_quota_maxkeys; unsigned maxbytes = uid_eq(uid, GLOBAL_ROOT_UID) ? 
key_quota_root_maxbytes : key_quota_maxbytes; spin_lock(&user->lock); if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) { if (user->qnkeys + 1 > maxkeys || user->qnbytes + quotalen > maxbytes || user->qnbytes + quotalen < user->qnbytes) goto no_quota; } user->qnkeys++; user->qnbytes += quotalen; spin_unlock(&user->lock); } /* allocate and initialise the key and its description */ key = kmem_cache_zalloc(key_jar, GFP_KERNEL); if (!key) goto no_memory_2; key->index_key.desc_len = desclen; key->index_key.description = kmemdup(desc, desclen + 1, GFP_KERNEL); if (!key->index_key.description) goto no_memory_3; key->index_key.type = type; key_set_index_key(&key->index_key); refcount_set(&key->usage, 1); init_rwsem(&key->sem); lockdep_set_class(&key->sem, &type->lock_class); key->user = user; key->quotalen = quotalen; key->datalen = type->def_datalen; key->uid = uid; key->gid = gid; key->perm = perm; key->expiry = TIME64_MAX; key->restrict_link = restrict_link; key->last_used_at = ktime_get_real_seconds(); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_BUILT_IN) key->flags |= 1 << KEY_FLAG_BUILTIN; if (flags & KEY_ALLOC_UID_KEYRING) key->flags |= 1 << KEY_FLAG_UID_KEYRING; if (flags & KEY_ALLOC_SET_KEEP) key->flags |= 1 << KEY_FLAG_KEEP; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; #endif /* let the security module know about the key */ ret = security_key_alloc(key, cred, flags); if (ret < 0) goto security_error; /* publish the key by giving it a serial number */ refcount_inc(&key->domain_tag->usage); atomic_inc(&user->nkeys); key_alloc_serial(key); error: return key; security_error: kfree(key->description); kmem_cache_free(key_jar, key); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock(&user->lock); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock(&user->lock); } key_user_put(user); key = ERR_PTR(ret); goto error; no_memory_3: kmem_cache_free(key_jar, key); no_memory_2: if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) { spin_lock(&user->lock); user->qnkeys--; user->qnbytes -= quotalen; spin_unlock(&user->lock); } key_user_put(user); no_memory_1: key = ERR_PTR(-ENOMEM); goto error; no_quota: spin_unlock(&user->lock); key_user_put(user); key = ERR_PTR(-EDQUOT); goto error; } EXPORT_SYMBOL(key_alloc); /** * key_payload_reserve - Adjust data quota reservation for the key's payload * @key: The key to make the reservation for. * @datalen: The amount of data payload the caller now wants. * * Adjust the amount of the owning user's key data quota that a key reserves. * If the amount is increased, then -EDQUOT may be returned if there isn't * enough free quota available. * * If successful, 0 is returned. */ int key_payload_reserve(struct key *key, size_t datalen) { int delta = (int)datalen - key->datalen; int ret = 0; key_check(key); /* contemplate the quota adjustment */ if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) { unsigned maxbytes = uid_eq(key->user->uid, GLOBAL_ROOT_UID) ? key_quota_root_maxbytes : key_quota_maxbytes; spin_lock(&key->user->lock); if (delta > 0 && (key->user->qnbytes + delta > maxbytes || key->user->qnbytes + delta < key->user->qnbytes)) { ret = -EDQUOT; } else { key->user->qnbytes += delta; key->quotalen += delta; } spin_unlock(&key->user->lock); } /* change the recorded data length if that didn't generate an error */ if (ret == 0) key->datalen = datalen; return ret; } EXPORT_SYMBOL(key_payload_reserve); /* * Change the key state to being instantiated. 
*/ static void mark_key_instantiated(struct key *key, int reject_error) { /* Commit the payload before setting the state; barrier versus * key_read_state(). */ smp_store_release(&key->state, (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); } /* * Instantiate a key and link it into the target keyring atomically. Must be * called with the target keyring's semaphore writelocked. The target key's * semaphore need not be locked as instantiation is serialised by * key_construction_mutex. */ static int __key_instantiate_and_link(struct key *key, struct key_preparsed_payload *prep, struct key *keyring, struct key *authkey, struct assoc_array_edit **_edit) { int ret, awaken; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* instantiate the key */ ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_INSTANTIATED, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; /* and link it into the destination keyring */ if (keyring) { if (test_bit(KEY_FLAG_KEEP, &keyring->flags)) set_bit(KEY_FLAG_KEEP, &key->flags); __key_link(keyring, key, _edit); } /* disable the authorisation key */ if (authkey) key_invalidate(authkey); key_set_expiry(key, prep->expiry); } } mutex_unlock(&key_construction_mutex); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret; } /** * key_instantiate_and_link - Instantiate a key and link it into the keyring. * @key: The key to instantiate. * @data: The data to use to instantiate the keyring. * @datalen: The length of @data. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Instantiate a key that's in the uninstantiated state using the provided data * and, if successful, link it in to the destination keyring if one is * supplied. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. */ int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey) { struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; int ret; memset(&prep, 0, sizeof(prep)); prep.orig_description = key->description; prep.data = data; prep.datalen = datalen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } if (keyring) { ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_link_end; if (keyring->restrict_link && keyring->restrict_link->check) { struct key_restriction *keyres = keyring->restrict_link; ret = keyres->check(keyring, key->type, &prep.payload, keyres->key); if (ret < 0) goto error_link_end; } } ret = __key_instantiate_and_link(key, &prep, keyring, authkey, &edit); error_link_end: if (keyring) __key_link_end(keyring, &key->index_key, edit); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_instantiate_and_link); /** * key_reject_and_link - Negatively instantiate a key and link it into the keyring. 
* @key: The key to instantiate. * @timeout: The timeout on the negative key. * @error: The error to return when the key is hit. * @keyring: Keyring to create a link in on success (or NULL). * @authkey: The authorisation token permitting instantiation. * * Negatively instantiate a key that's in the uninstantiated state and, if * successful, set its timeout and stored error and link it in to the * destination keyring if one is supplied. The key and any links to the key * will be automatically garbage collected after the timeout expires. * * Negative keys are used to rate limit repeated request_key() calls by causing * them to return the stored error code (typically ENOKEY) until the negative * key expires. * * If successful, 0 is returned, the authorisation token is revoked and anyone * waiting for the key is woken up. If the key was already instantiated, * -EBUSY will be returned. */ int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey) { struct assoc_array_edit *edit = NULL; int ret, awaken, link_ret = 0; key_check(key); key_check(keyring); awaken = 0; ret = -EBUSY; if (keyring) { if (keyring->restrict_link) return -EPERM; link_ret = __key_link_lock(keyring, &key->index_key); if (link_ret == 0) { link_ret = __key_link_begin(keyring, &key->index_key, &edit); if (link_ret < 0) __key_link_end(keyring, &key->index_key, edit); } } mutex_lock(&key_construction_mutex); /* can't instantiate twice */ if (key->state == KEY_IS_UNINSTANTIATED) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); mark_key_instantiated(key, -error); notify_key(key, NOTIFY_KEY_INSTANTIATED, -error); key_set_expiry(key, ktime_get_real_seconds() + timeout); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; ret = 0; /* and link it into the destination keyring */ if (keyring && link_ret == 0) __key_link(keyring, key, &edit); /* disable the authorisation key */ if (authkey) key_invalidate(authkey); } mutex_unlock(&key_construction_mutex); if (keyring && link_ret == 0) __key_link_end(keyring, &key->index_key, edit); /* wake up anyone waiting for a key to be constructed */ if (awaken) wake_up_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT); return ret == 0 ? link_ret : ret; } EXPORT_SYMBOL(key_reject_and_link); /** * key_put - Discard a reference to a key. * @key: The key to discard a reference from. * * Discard a reference to a key, and when all the references are gone, we * schedule the cleanup task to come and pull it out of the tree in process * context at some later time. */ void key_put(struct key *key) { if (key) { key_check(key); if (refcount_dec_and_test(&key->usage)) schedule_work(&key_gc_work); } } EXPORT_SYMBOL(key_put); /* * Find a key by its serial number. */ struct key *key_lookup(key_serial_t id) { struct rb_node *n; struct key *key; spin_lock(&key_serial_lock); /* search the tree for the specified key */ n = key_serial_tree.rb_node; while (n) { key = rb_entry(n, struct key, serial_node); if (id < key->serial) n = n->rb_left; else if (id > key->serial) n = n->rb_right; else goto found; } not_found: key = ERR_PTR(-ENOKEY); goto error; found: /* A key is allowed to be looked up only if someone still owns a * reference to it - otherwise it's awaiting the gc. */ if (!refcount_inc_not_zero(&key->usage)) goto not_found; error: spin_unlock(&key_serial_lock); return key; } EXPORT_SYMBOL(key_lookup); /* * Find and lock the specified key type against removal. 
* * We return with the sem read-locked if successful. If the type wasn't * available -ENOKEY is returned instead. */ struct key_type *key_type_lookup(const char *type) { struct key_type *ktype; down_read(&key_types_sem); /* look up the key type to see if it's one of the registered kernel * types */ list_for_each_entry(ktype, &key_types_list, link) { if (strcmp(ktype->name, type) == 0) goto found_kernel_type; } up_read(&key_types_sem); ktype = ERR_PTR(-ENOKEY); found_kernel_type: return ktype; } void key_set_timeout(struct key *key, unsigned timeout) { time64_t expiry = TIME64_MAX; /* make the changes with the locks held to prevent races */ down_write(&key->sem); if (timeout > 0) expiry = ktime_get_real_seconds() + timeout; key_set_expiry(key, expiry); up_write(&key->sem); } EXPORT_SYMBOL_GPL(key_set_timeout); /* * Unlock a key type locked by key_type_lookup(). */ void key_type_put(struct key_type *ktype) { up_read(&key_types_sem); } /* * Attempt to update an existing key. * * The key is given to us with an incremented refcount that we need to discard * if we get an error. */ static inline key_ref_t __key_update(key_ref_t key_ref, struct key_preparsed_payload *prep) { struct key *key = key_ref_to_ptr(key_ref); int ret; /* need write permission on the key to update it */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) goto error; ret = -EEXIST; if (!key->type->update) goto error; down_write(&key->sem); ret = key->type->update(key, prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); if (ret < 0) goto error; out: return key_ref; error: key_put(key); key_ref = ERR_PTR(ret); goto out; } /* * Create or potentially update a key. 
The combined logic behind * key_create_or_update() and key_create() */ static key_ref_t __key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags, bool allow_update) { struct keyring_index_key index_key = { .description = description, }; struct key_preparsed_payload prep; struct assoc_array_edit *edit = NULL; const struct cred *cred = current_cred(); struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; struct key_restriction *restrict_link = NULL; /* look up the key type to see if it's one of the registered kernel * types */ index_key.type = key_type_lookup(type); if (IS_ERR(index_key.type)) { key_ref = ERR_PTR(-ENODEV); goto error; } key_ref = ERR_PTR(-EINVAL); if (!index_key.type->instantiate || (!index_key.description && !index_key.type->preparse)) goto error_put_type; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (!(flags & KEY_ALLOC_BYPASS_RESTRICTION)) restrict_link = keyring->restrict_link; key_ref = ERR_PTR(-ENOTDIR); if (keyring->type != &key_type_keyring) goto error_put_type; memset(&prep, 0, sizeof(prep)); prep.orig_description = description; prep.data = payload; prep.datalen = plen; prep.quotalen = index_key.type->def_datalen; prep.expiry = TIME64_MAX; if (index_key.type->preparse) { ret = index_key.type->preparse(&prep); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } if (!index_key.description) index_key.description = prep.description; key_ref = ERR_PTR(-EINVAL); if (!index_key.description) goto error_free_prep; } index_key.desc_len = strlen(index_key.description); key_set_index_key(&index_key); ret = __key_link_lock(keyring, &index_key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; } ret = __key_link_begin(keyring, &index_key, &edit); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } if (restrict_link && restrict_link->check) { ret = restrict_link->check(keyring, index_key.type, &prep.payload, restrict_link->key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } } /* if we're going to allocate a new key, we're going to have * to modify the keyring */ ret = key_permission(keyring_ref, KEY_NEED_WRITE); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_link_end; } /* if it's requested and possible to update this type of key, search * for an existing key of the same type and description in the * destination keyring and update that instead if possible */ if (allow_update) { if (index_key.type->update) { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) goto found_matching_key; } } else { key_ref = find_key_to_update(keyring_ref, &index_key); if (key_ref) { key_ref_put(key_ref); key_ref = ERR_PTR(-EEXIST); goto error_link_end; } } /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (index_key.type->read) perm |= KEY_POS_READ; if (index_key.type == &key_type_keyring || index_key.type->update) perm |= KEY_POS_WRITE; } /* allocate a new key */ key = key_alloc(index_key.type, index_key.description, cred->fsuid, cred->fsgid, cred, perm, flags, NULL); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error_link_end; } /* instantiate it and link it into the target keyring */ ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &edit); if (ret < 0) { key_put(key); key_ref = ERR_PTR(ret); goto error_link_end; } 
ima_post_key_create_or_update(keyring, key, payload, plen, flags, true); key_ref = make_key_ref(key, is_key_possessed(keyring_ref)); error_link_end: __key_link_end(keyring, &index_key, edit); error_free_prep: if (index_key.type->preparse) index_key.type->free_preparse(&prep); error_put_type: key_type_put(index_key.type); error: return key_ref; found_matching_key: /* we found a matching key, so we're going to try to update it * - we can drop the locks first as we have the key pinned */ __key_link_end(keyring, &index_key, edit); key = key_ref_to_ptr(key_ref); if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) { ret = wait_for_key_construction(key, true); if (ret < 0) { key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error_free_prep; } } key_ref = __key_update(key_ref, &prep); if (!IS_ERR(key_ref)) ima_post_key_create_or_update(keyring, key, payload, plen, flags, false); goto error_free_prep; } /** * key_create_or_update - Update or create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Search the destination keyring for a key of the same description and if one * is found, update it, otherwise create and instantiate a new one and create a * link to it from that keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -ENODEV if the key type * wasn't available, -ENOTDIR if the keyring wasn't a keyring, -EACCES if the * caller isn't permitted to modify the keyring or the LSM did not permit * creation of the key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. */ key_ref_t key_create_or_update(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, true); } EXPORT_SYMBOL(key_create_or_update); /** * key_create - Create and instantiate a key. * @keyring_ref: A pointer to the destination keyring with possession flag. * @type: The type of key. * @description: The searchable description for the key. * @payload: The data to use to instantiate or update the key. * @plen: The length of @payload. * @perm: The permissions mask for a new key. * @flags: The quota flags for a new key. * * Create and instantiate a new key and link to it from the destination keyring. * * If perm is KEY_PERM_UNDEF then an appropriate key permissions mask will be * concocted. * * Returns a pointer to the new key if successful, -EEXIST if a key with the * same description already exists, -ENODEV if the key type wasn't available, * -ENOTDIR if the keyring wasn't a keyring, -EACCES if the caller isn't * permitted to modify the keyring or the LSM did not permit creation of the * key. * * On success, the possession flag from the keyring ref will be tacked on to * the key ref before it is returned. 
*/ key_ref_t key_create(key_ref_t keyring_ref, const char *type, const char *description, const void *payload, size_t plen, key_perm_t perm, unsigned long flags) { return __key_create_or_update(keyring_ref, type, description, payload, plen, perm, flags, false); } EXPORT_SYMBOL(key_create); /** * key_update - Update a key's contents. * @key_ref: The pointer (plus possession flag) to the key. * @payload: The data to be used to update the key. * @plen: The length of @payload. * * Attempt to update the contents of a key with the given payload data. The * caller must be granted Write permission on the key. Negative keys can be * instantiated by this method. * * Returns 0 on success, -EACCES if not permitted and -EOPNOTSUPP if the key * type does not support updating. The key type may return other errors. */ int key_update(key_ref_t key_ref, const void *payload, size_t plen) { struct key_preparsed_payload prep; struct key *key = key_ref_to_ptr(key_ref); int ret; key_check(key); /* the key must be writable */ ret = key_permission(key_ref, KEY_NEED_WRITE); if (ret < 0) return ret; /* attempt to update it if supported */ if (!key->type->update) return -EOPNOTSUPP; memset(&prep, 0, sizeof(prep)); prep.data = payload; prep.datalen = plen; prep.quotalen = key->type->def_datalen; prep.expiry = TIME64_MAX; if (key->type->preparse) { ret = key->type->preparse(&prep); if (ret < 0) goto error; } down_write(&key->sem); ret = key->type->update(key, &prep); if (ret == 0) { /* Updating a negative key positively instantiates it */ mark_key_instantiated(key, 0); notify_key(key, NOTIFY_KEY_UPDATED, 0); } up_write(&key->sem); error: if (key->type->preparse) key->type->free_preparse(&prep); return ret; } EXPORT_SYMBOL(key_update); /** * key_revoke - Revoke a key. * @key: The key to be revoked. * * Mark a key as being revoked and ask the type to free up its resources. The * revocation timeout is set and the key and all its links will be * automatically garbage collected after key_gc_delay amount of time if they * are not manually dealt with first. */ void key_revoke(struct key *key) { time64_t time; key_check(key); /* make sure no one's trying to change or use the key when we mark it * - we tell lockdep that we might nest because we might be revoking an * authorisation key whilst holding the sem on a key we've just * instantiated */ down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_REVOKED, &key->flags)) { notify_key(key, NOTIFY_KEY_REVOKED, 0); if (key->type->revoke) key->type->revoke(key); /* set the death time to no more than the expiry time */ time = ktime_get_real_seconds(); if (key->revoked_at == 0 || key->revoked_at > time) { key->revoked_at = time; key_schedule_gc(key->revoked_at + key_gc_delay); } } up_write(&key->sem); } EXPORT_SYMBOL(key_revoke); /** * key_invalidate - Invalidate a key. * @key: The key to be invalidated. * * Mark a key as being invalidated and have it cleaned up immediately. The key * is ignored by all searches and other operations from this point. */ void key_invalidate(struct key *key) { kenter("%d", key_serial(key)); key_check(key); if (!test_bit(KEY_FLAG_INVALIDATED, &key->flags)) { down_write_nested(&key->sem, 1); if (!test_and_set_bit(KEY_FLAG_INVALIDATED, &key->flags)) { notify_key(key, NOTIFY_KEY_INVALIDATED, 0); key_schedule_gc_links(); } up_write(&key->sem); } } EXPORT_SYMBOL(key_invalidate); /** * generic_key_instantiate - Simple instantiation of a key from preparsed data * @key: The key to be instantiated * @prep: The preparsed data to load. 
 *
 * Instantiate a key from preparsed data. We assume we can just copy the data
 * in directly and clear the old pointers.
 *
 * This can be pointed to directly by the key type instantiate op pointer.
 */
int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
{
    int ret;

    pr_devel("==>%s()\n", __func__);

    ret = key_payload_reserve(key, prep->quotalen);
    if (ret == 0) {
        rcu_assign_keypointer(key, prep->payload.data[0]);
        key->payload.data[1] = prep->payload.data[1];
        key->payload.data[2] = prep->payload.data[2];
        key->payload.data[3] = prep->payload.data[3];
        prep->payload.data[0] = NULL;
        prep->payload.data[1] = NULL;
        prep->payload.data[2] = NULL;
        prep->payload.data[3] = NULL;
    }
    pr_devel("<==%s() = %d\n", __func__, ret);
    return ret;
}
EXPORT_SYMBOL(generic_key_instantiate);

/**
 * register_key_type - Register a type of key.
 * @ktype: The new key type.
 *
 * Register a new key type.
 *
 * Returns 0 on success or -EEXIST if a type of this name already exists.
 */
int register_key_type(struct key_type *ktype)
{
    struct key_type *p;
    int ret;

    memset(&ktype->lock_class, 0, sizeof(ktype->lock_class));

    ret = -EEXIST;
    down_write(&key_types_sem);

    /* disallow key types with the same name */
    list_for_each_entry(p, &key_types_list, link) {
        if (strcmp(p->name, ktype->name) == 0)
            goto out;
    }

    /* store the type */
    list_add(&ktype->link, &key_types_list);

    pr_notice("Key type %s registered\n", ktype->name);
    ret = 0;

out:
    up_write(&key_types_sem);
    return ret;
}
EXPORT_SYMBOL(register_key_type);

/**
 * unregister_key_type - Unregister a type of key.
 * @ktype: The key type.
 *
 * Unregister a key type and mark all the extant keys of this type as dead.
 * Those keys of this type are then destroyed to get rid of their payloads and
 * they and their links will be garbage collected as soon as possible.
 */
void unregister_key_type(struct key_type *ktype)
{
    down_write(&key_types_sem);
    list_del_init(&ktype->link);
    downgrade_write(&key_types_sem);
    key_gc_keytype(ktype);
    pr_notice("Key type %s unregistered\n", ktype->name);
    up_read(&key_types_sem);
}
EXPORT_SYMBOL(unregister_key_type);

/*
 * Initialise the key management state.
 */
void __init key_init(void)
{
    /* allocate a slab in which we can store keys */
    key_jar = kmem_cache_create("key_jar", sizeof(struct key),
                                0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

    /* add the special key types */
    list_add_tail(&key_type_keyring.link, &key_types_list);
    list_add_tail(&key_type_dead.link, &key_types_list);
    list_add_tail(&key_type_user.link, &key_types_list);
    list_add_tail(&key_type_logon.link, &key_types_list);

    /* record the root user tracking */
    rb_link_node(&root_key_user.node, NULL, &key_user_tree.rb_node);
    rb_insert_color(&root_key_user.node, &key_user_tree);
}
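/*
 * Illustrative sketch (not part of the file above): creating and
 * instantiating a "user"-type key from kernel code, roughly the create
 * half of what __key_create_or_update() does. The description and
 * payload are made up and error handling is abbreviated.
 */
static struct key *example_make_key(const struct cred *cred)
{
    static const char payload[] = "example payload";
    struct key *key;
    int ret;

    /* key_alloc() returns the key in an uninstantiated state. */
    key = key_alloc(&key_type_user, "example:desc",
                    cred->fsuid, cred->fsgid, cred,
                    KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
                    KEY_ALLOC_NOT_IN_QUOTA, NULL);
    if (IS_ERR(key))
        return key;

    /* Instantiate with the payload; no target keyring, no authkey. */
    ret = key_instantiate_and_link(key, payload, sizeof(payload) - 1,
                                   NULL, NULL);
    if (ret < 0) {
        key_put(key);
        return ERR_PTR(ret);
    }
    return key;
}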
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions for diskquota-operations. When diskquota is configured these
 * macros expand to the right source-code.
 *
 * Author: Marco van Wieringen <mvw@planets.elm.net>
 */
#ifndef _LINUX_QUOTAOPS_
#define _LINUX_QUOTAOPS_

#include <linux/fs.h>

#define DQUOT_SPACE_WARN    0x1
#define DQUOT_SPACE_RESERVE 0x2
#define DQUOT_SPACE_NOFAIL  0x4

static inline struct quota_info *sb_dqopt(struct super_block *sb)
{
    return &sb->s_dquot;
}

/* i_mutex must be held */
static inline bool is_quota_modification(struct mnt_idmap *idmap,
                                         struct inode *inode, struct iattr *ia)
{
    return ((ia->ia_valid & ATTR_SIZE) ||
            i_uid_needs_update(idmap, ia, inode) ||
            i_gid_needs_update(idmap, ia, inode));
}

#if defined(CONFIG_QUOTA)

#define quota_error(sb, fmt, args...) \
    __quota_error((sb), __func__, fmt , ## args)

extern __printf(3, 4)
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...);

/*
 * declaration of quota_function calls in kernel.
 */
int dquot_initialize(struct inode *inode);
bool dquot_initialize_needed(struct inode *inode);
void dquot_drop(struct inode *inode);
struct dquot *dqget(struct super_block *sb, struct kqid qid);
static inline struct dquot *dqgrab(struct dquot *dquot)
{
    /* Make sure someone else has active reference to dquot */
    WARN_ON_ONCE(!atomic_read(&dquot->dq_count));
    WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
    atomic_inc(&dquot->dq_count);
    return dquot;
}

static inline bool dquot_is_busy(struct dquot *dquot)
{
    if (test_bit(DQ_MOD_B, &dquot->dq_flags))
        return true;
    if (atomic_read(&dquot->dq_count) > 0)
        return true;
    return false;
}

void dqput(struct dquot *dquot);
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv);
struct dquot *dquot_alloc(struct super_block *sb, int type);
void dquot_destroy(struct dquot *dquot);

int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags);
void __dquot_free_space(struct inode *inode, qsize_t number, int flags);

int dquot_alloc_inode(struct inode *inode);

void dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
void dquot_free_inode(struct inode *inode);
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number);

int dquot_disable(struct super_block *sb, int type, unsigned int flags);
/* Suspend quotas on remount RO */
static inline int dquot_suspend(struct super_block *sb, int type)
{
    return dquot_disable(sb, type, DQUOT_SUSPENDED);
}
int dquot_resume(struct super_block *sb, int type);

int dquot_commit(struct dquot *dquot);
int dquot_acquire(struct dquot *dquot);
int dquot_release(struct dquot *dquot);
int dquot_commit_info(struct super_block *sb, int type);
int dquot_get_next_id(struct super_block *sb, struct kqid *qid);
int dquot_mark_dquot_dirty(struct dquot *dquot);

int dquot_file_open(struct inode *inode, struct file *file);

int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
                        unsigned int flags);
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
                           unsigned int flags);
int dquot_quota_on(struct super_block *sb, int type, int format_id,
                   const struct path *path);
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
                         int format_id, int type);
int dquot_quota_off(struct super_block *sb, int type);
int dquot_writeback_dquots(struct super_block *sb, int type);
int dquot_quota_sync(struct super_block *sb, int type);
int dquot_get_state(struct super_block *sb, struct qc_state *state);
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii);
int dquot_get_dqblk(struct super_block *sb, struct kqid id,
                    struct qc_dqblk *di);
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
                         struct qc_dqblk *di);
int dquot_set_dqblk(struct super_block *sb, struct kqid id,
                    struct qc_dqblk *di);

int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode,
                   struct iattr *iattr);

static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
{
    return sb_dqopt(sb)->info + type;
}

/*
 * Functions for checking status of quota
 */

static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
    return sb_dqopt(sb)->flags &
                dquot_state_flag(DQUOT_USAGE_ENABLED, type);
}

static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
    return sb_dqopt(sb)->flags &
                dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
}

static inline bool sb_has_quota_suspended(struct super_block *sb, int type)
{
    return sb_dqopt(sb)->flags &
                dquot_state_flag(DQUOT_SUSPENDED, type);
}

static inline unsigned sb_any_quota_suspended(struct super_block *sb)
{
    return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED);
}

/* Does kernel know about any quota information for given sb + type? */
static inline bool sb_has_quota_loaded(struct super_block *sb, int type)
{
    /* Currently if anything is on, then quota usage is on as well */
    return sb_has_quota_usage_enabled(sb, type);
}

static inline unsigned sb_any_quota_loaded(struct super_block *sb)
{
    return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED);
}

static inline bool sb_has_quota_active(struct super_block *sb, int type)
{
    return sb_has_quota_loaded(sb, type) &&
           !sb_has_quota_suspended(sb, type);
}

/*
 * Operations supported for diskquotas.
 */
extern const struct dquot_operations dquot_operations;
extern const struct quotactl_ops dquot_quotactl_sysfile_ops;

#else

static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
    return 0;
}

static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
    return 0;
}

static inline int sb_has_quota_suspended(struct super_block *sb, int type)
{
    return 0;
}

static inline int sb_any_quota_suspended(struct super_block *sb)
{
    return 0;
}

/* Does kernel know about any quota information for given sb + type? */
static inline int sb_has_quota_loaded(struct super_block *sb, int type)
{
    return 0;
}

static inline int sb_any_quota_loaded(struct super_block *sb)
{
    return 0;
}

static inline int sb_has_quota_active(struct super_block *sb, int type)
{
    return 0;
}

static inline int dquot_initialize(struct inode *inode)
{
    return 0;
}

static inline bool dquot_initialize_needed(struct inode *inode)
{
    return false;
}

static inline void dquot_drop(struct inode *inode)
{
}

static inline int dquot_alloc_inode(struct inode *inode)
{
    return 0;
}

static inline void dquot_free_inode(struct inode *inode)
{
}

static inline int dquot_transfer(struct mnt_idmap *idmap,
                                 struct inode *inode, struct iattr *iattr)
{
    return 0;
}

static inline int __dquot_alloc_space(struct inode *inode, qsize_t number,
                                      int flags)
{
    if (!(flags & DQUOT_SPACE_RESERVE))
        inode_add_bytes(inode, number);
    return 0;
}

static inline void __dquot_free_space(struct inode *inode, qsize_t number,
                                      int flags)
{
    if (!(flags & DQUOT_SPACE_RESERVE))
        inode_sub_bytes(inode, number);
}

static inline void dquot_claim_space_nodirty(struct inode *inode,
                                             qsize_t number)
{
    inode_add_bytes(inode, number);
}

static inline int dquot_reclaim_space_nodirty(struct inode *inode,
                                              qsize_t number)
{
    inode_sub_bytes(inode, number);
    return 0;
}

static inline int dquot_disable(struct super_block *sb, int type,
                                unsigned int flags)
{
    return 0;
}

static inline int dquot_suspend(struct super_block *sb, int type)
{
    return 0;
}

static inline int dquot_resume(struct super_block *sb, int type)
{
    return 0;
}

#define dquot_file_open generic_file_open

static inline int dquot_writeback_dquots(struct super_block *sb, int type)
{
    return 0;
}

#endif /* CONFIG_QUOTA */

static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
    return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);
}

static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr)
{
    __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL);
    mark_inode_dirty_sync(inode);
}

static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
{
    int ret;

    ret = dquot_alloc_space_nodirty(inode, nr);
    if (!ret) {
        /*
         * Mark inode fully dirty. Since we are allocating blocks, inode
         * would become fully dirty soon anyway and it reportedly
         * reduces lock contention.
         */
        mark_inode_dirty(inode);
    }
    return ret;
}

static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr)
{
    return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits);
}

static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr)
{
    dquot_alloc_space_nofail(inode, nr << inode->i_blkbits);
}

static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
{
    return dquot_alloc_space(inode, nr << inode->i_blkbits);
}

static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
{
    return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0);
}

static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
{
    int ret;

    ret = dquot_prealloc_block_nodirty(inode, nr);
    if (!ret)
        mark_inode_dirty_sync(inode);
    return ret;
}

static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
    return __dquot_alloc_space(inode, nr << inode->i_blkbits,
                               DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE);
}

static inline void dquot_claim_block(struct inode *inode, qsize_t nr)
{
    dquot_claim_space_nodirty(inode, nr << inode->i_blkbits);
    mark_inode_dirty_sync(inode);
}

static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr)
{
    dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits);
    mark_inode_dirty_sync(inode);
}

static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr)
{
    __dquot_free_space(inode, nr, 0);
}

static inline void dquot_free_space(struct inode *inode, qsize_t nr)
{
    dquot_free_space_nodirty(inode, nr);
    mark_inode_dirty_sync(inode);
}

static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr)
{
    dquot_free_space_nodirty(inode, nr << inode->i_blkbits);
}

static inline void dquot_free_block(struct inode *inode, qsize_t nr)
{
    dquot_free_space(inode, nr << inode->i_blkbits);
}

static inline void dquot_release_reservation_block(struct inode *inode,
                                                   qsize_t nr)
{
    __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE);
}

unsigned int qtype_enforce_flag(int type);

#endif /* _LINUX_QUOTAOPS_ */
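/*
 * Illustrative sketch (not part of the header above): the usual pattern
 * for charging block quota around an on-disk allocation, as a filesystem
 * might write it. The myfs_* names are hypothetical; only the dquot_*
 * calls come from this header.
 */
static int myfs_do_alloc(struct inode *inode, unsigned int count)
{
    return 0;   /* stub standing in for the real on-disk allocation */
}

static int myfs_alloc_blocks(struct inode *inode, unsigned int count)
{
    int err;

    /* Charge the quota first; fails with -EDQUOT over the limit. */
    err = dquot_alloc_block(inode, count);
    if (err)
        return err;

    err = myfs_do_alloc(inode, count);
    if (err) {
        /* Allocation failed: return the charged blocks to the quota. */
        dquot_free_block(inode, count);
        return err;
    }
    return 0;
}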
| 76 136 136 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/backing-dev.h * * low-level device information and state which is propagated up through * to high-level code. */ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/device.h> #include <linux/writeback.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); return bdi; } struct backing_dev_info *bdi_get_by_id(u64 id); void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { return test_bit(WB_has_dirty_io, &wb->state); } static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { /* * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are * any dirty wbs. See wb_update_write_bandwidth(). 
*/ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_WRITEBACK_ACCT (1 << 1) #define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. 
#ifdef CONFIG_CGROUP_WRITEBACK

struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct cgroup_subsys_state *css);

/**
 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
 * @inode: inode of interest
 *
 * Cgroup writeback requires support from the filesystem.  Also, both memcg and
 * iocg have to be on the default hierarchy.  Test whether all conditions are
 * met.
 *
 * Note that the test result may change dynamically on the same inode
 * depending on how memcg and iocg are configured.
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
		cgroup_subsys_on_dfl(io_cgrp_subsys) &&
		(bdi->capabilities & BDI_CAP_WRITEBACK) &&
		(inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
	struct cgroup_subsys_state *memcg_css;
	struct bdi_writeback *wb;

	memcg_css = task_css(current, memory_cgrp_id);
	if (!memcg_css->parent)
		return &bdi->wb;

	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);

	/*
	 * %current's blkcg equals the effective blkcg of its memcg.  No
	 * need to use the relatively expensive cgroup_get_e_css().
	 */
	if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
		return wb;
	return NULL;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  This function is
 * called from a relatively hot path and optimizes the common cases using
 * wb_find_current().
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	wb = wb_find_current(bdi);
	if (wb && unlikely(!wb_tryget(wb)))
		wb = NULL;
	rcu_read_unlock();

	if (unlikely(!wb)) {
		struct cgroup_subsys_state *memcg_css;

		memcg_css = task_get_css(current, memory_cgrp_id);
		wb = wb_get_create(bdi, memcg_css, gfp);
		css_put(memcg_css);
	}
	return wb;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Returns the wb @inode is currently associated with.  The caller must be
 * holding either @inode->i_lock, the i_pages lock, or the
 * associated wb's list_lock.
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
	WARN_ON_ONCE(debug_locks &&
		     (!lockdep_is_held(&inode->i_lock) &&
		      !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
		      !lockdep_is_held(&inode->i_wb->list_lock)));
#endif
	return inode->i_wb;
}

static inline struct bdi_writeback *inode_to_wb_wbc(
				struct inode *inode,
				struct writeback_control *wbc)
{
	/*
	 * If wbc does not have inode attached, it means cgroup writeback was
	 * disabled when wbc started. Just use the default wb in that case.
	 */
	return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction.  IRQs may or may not be disabled on
 * return.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
	rcu_read_lock();

	/*
	 * Paired with store_release in inode_switch_wbs_work_fn() and
	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
	 */
	cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

	if (unlikely(cookie->locked))
		xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

	/*
	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
	 * lock.  inode_to_wb() will bark.  Deref directly.
	 */
	return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
					    struct wb_lock_cookie *cookie)
{
	if (unlikely(cookie->locked))
		xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);

	rcu_read_unlock();
}

#else	/* CONFIG_CGROUP_WRITEBACK */

static inline bool inode_cgwb_enabled(struct inode *inode)
{
	return false;
}

static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
	return &bdi->wb;
}

static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
	return &bdi->wb;
}

static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
	return &inode_to_bdi(inode)->wb;
}

static inline struct bdi_writeback *inode_to_wb_wbc(
				struct inode *inode,
				struct writeback_control *wbc)
{
	return inode_to_wb(inode);
}

static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
	return inode_to_wb(inode);
}

static inline void unlocked_inode_to_wb_end(struct inode *inode,
					    struct wb_lock_cookie *cookie)
{
}

static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

static inline void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif	/* _LINUX_BACKING_DEV_H */
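/*
 * Illustrative sketch, not part of the original header: pairing the
 * unlocked wb transaction helpers declared above to adjust a per-wb
 * counter without taking i_lock, the i_pages lock, or wb->list_lock.
 * The function name is hypothetical.
 */
static inline void example_dec_reclaimable(struct inode *inode)
{
	struct wb_lock_cookie cookie = {};
	struct bdi_writeback *wb;

	wb = unlocked_inode_to_wb_begin(inode, &cookie);
	dec_wb_stat(wb, WB_RECLAIMABLE);	/* wb can't be switched away here */
	unlocked_inode_to_wb_end(inode, &cookie);
}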
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Kernel Electric-Fence (KFENCE). For more info please see
 * Documentation/dev-tools/kfence.rst.
 *
 * Copyright (C) 2020, Google LLC.
 */

#ifndef MM_KFENCE_KFENCE_H
#define MM_KFENCE_KFENCE_H

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#include "../slab.h" /* for struct kmem_cache */

/*
 * Get the canary byte pattern for @addr. Use a pattern that varies based on the
 * lower 3 bits of the address, to detect memory corruptions with higher
 * probability, where similar constants are used.
 */
#define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))

/*
 * Define a continuous 8-byte canary starting from a multiple of 8. The canary
 * of each byte is only related to the lowest three bits of its address, so the
 * canary of every 8 bytes is the same. 64-bit memory can be filled and checked
 * at a time instead of byte by byte to improve performance.
 */
#define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100)))
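/*
 * Illustrative note, not in the original header: on a little-endian
 * machine the XOR above evaluates to 0xadacafaea9a8abaa, i.e. byte k of
 * the word equals KFENCE_CANARY_PATTERN_U8(base + k) for an 8-byte
 * aligned base, which is what allows whole canary words to be filled
 * and verified in a single 64-bit access.
 */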
/* Maximum stack depth for reports. */
#define KFENCE_STACK_DEPTH 64

/* KFENCE object states. */
enum kfence_object_state {
	KFENCE_OBJECT_UNUSED,		/* Object is unused. */
	KFENCE_OBJECT_ALLOCATED,	/* Object is currently allocated. */
	KFENCE_OBJECT_FREED,		/* Object was allocated, and then freed. */
};

/* Alloc/free tracking information. */
struct kfence_track {
	pid_t pid;
	int cpu;
	u64 ts_nsec;
	int num_stack_entries;
	unsigned long stack_entries[KFENCE_STACK_DEPTH];
};

/* KFENCE metadata per guarded allocation. */
struct kfence_metadata {
	struct list_head list;		/* Freelist node; access under kfence_freelist_lock. */
	struct rcu_head rcu_head;	/* For delayed freeing. */

	/*
	 * Lock protecting below data; to ensure consistency of the below data,
	 * since the following may execute concurrently: __kfence_alloc(),
	 * __kfence_free(), kfence_handle_page_fault(). However, note that we
	 * cannot grab the same metadata off the freelist twice, and multiple
	 * __kfence_alloc() cannot run concurrently on the same metadata.
	 */
	raw_spinlock_t lock;

	/* The current state of the object; see above. */
	enum kfence_object_state state;

	/*
	 * Allocated object address; cannot be calculated from size, because of
	 * alignment requirements.
	 *
	 * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
	 */
	unsigned long addr;

	/*
	 * The size of the original allocation.
	 */
	size_t size;

	/*
	 * The kmem_cache cache of the last allocation; NULL if never allocated
	 * or the cache has already been destroyed.
	 */
	struct kmem_cache *cache;

	/*
	 * In case of an invalid access, the page that was unprotected; we
	 * optimistically only store one address.
	 */
	unsigned long unprotected_page;

	/* Allocation and free stack information. */
	struct kfence_track alloc_track;
	struct kfence_track free_track;
	/* For updating alloc_covered on frees. */
	u32 alloc_stack_hash;
#ifdef CONFIG_MEMCG
	struct obj_cgroup *objcg;
#endif
};

#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
					CONFIG_KFENCE_NUM_OBJECTS)

extern struct kfence_metadata *kfence_metadata;

static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
	long index;

	/* The checks do not affect performance; only called from slow-paths. */

	if (!is_kfence_address((void *)addr))
		return NULL;

	/*
	 * May be an invalid index if called with an address at the edge of
	 * __kfence_pool, in which case we would report an "invalid access"
	 * error.
	 */
	index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
	if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
		return NULL;

	return &kfence_metadata[index];
}

/* KFENCE error types for report generation. */
enum kfence_error_type {
	KFENCE_ERROR_OOB,		/* Detected an out-of-bounds access. */
	KFENCE_ERROR_UAF,		/* Detected a use-after-free access. */
	KFENCE_ERROR_CORRUPTION,	/* Detected a memory corruption on free. */
	KFENCE_ERROR_INVALID,		/* Invalid access of unknown type. */
	KFENCE_ERROR_INVALID_FREE,	/* Invalid free. */
};

void kfence_report_error(unsigned long address, bool is_write,
			 struct pt_regs *regs, const struct kfence_metadata *meta,
			 enum kfence_error_type type);

void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);

#endif /* MM_KFENCE_KFENCE_H */
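/*
 * Illustrative note on addr_to_metadata() above, not in the original
 * header, assuming PAGE_SIZE == 0x1000: the pool starts with two guard
 * pages, so an address 0x2000..0x3fff bytes into __kfence_pool yields
 * index (offset / 0x2000) - 1 == 0, the first metadata slot, while an
 * address inside the leading guard pages yields index -1 and NULL.
 */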
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _NET_GSO_H
#define _NET_GSO_H

#include <linux/skbuff.h>

/* Keeps track of mac header offset relative to skb->head.
 * It is useful for TSO of Tunneling protocol. e.g. GRE.
 * For non-tunnel skb it points to skb_mac_header() and for
 * tunnel skb it points to outer mac header.
 * Keeps track of level of encapsulation of network headers.
 */
struct skb_gso_cb {
	union {
		int	mac_offset;
		int	data_offset;
	};
	int	encap_level;
	__wsum	csum;
	__u16	csum_start;
};
#define SKB_GSO_CB_OFFSET	32
#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET))

static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
{
	return (skb_mac_header(inner_skb) - inner_skb->head) -
		SKB_GSO_CB(inner_skb)->mac_offset;
}

static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
{
	int new_headroom, headroom;
	int ret;

	headroom = skb_headroom(skb);
	ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);
	if (ret)
		return ret;

	new_headroom = skb_headroom(skb);
	SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom);
	return 0;
}

static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
{
	/* Do not update partial checksums if remote checksum is enabled. */
	if (skb->remcsum_offload)
		return;

	SKB_GSO_CB(skb)->csum = res;
	SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
}

/* Compute the checksum for a gso segment. First compute the checksum value
 * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and
 * then add in skb->csum (checksum from csum_start to end of packet).
 * skb->csum and csum_start are then updated to reflect the checksum of the
 * resultant packet starting from the transport header -- the resultant checksum
 * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo
 * header).
 */
static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
{
	unsigned char *csum_start = skb_transport_header(skb);
	int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start;
	__wsum partial = SKB_GSO_CB(skb)->csum;

	SKB_GSO_CB(skb)->csum = res;
	SKB_GSO_CB(skb)->csum_start = csum_start - skb->head;

	return csum_fold(csum_partial(csum_start, plen, partial));
}

struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path);

static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb,
					      netdev_features_t features)
{
	return __skb_gso_segment(skb, features, true);
}

struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
				    netdev_features_t features, __be16 type);

struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features);

bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu);

bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len);

static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol,
					int pulled_hlen, u16 mac_offset,
					int mac_len)
{
	skb->protocol = protocol;
	skb->encapsulation = 1;
	skb_push(skb, pulled_hlen);
	skb_reset_transport_header(skb);
	skb->mac_header = mac_offset;
	skb->network_header = skb->mac_header + mac_len;
	skb->mac_len = mac_len;
}

#endif /* _NET_GSO_H */
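/*
 * Illustrative sketch, not part of the original header: how a transmit
 * path typically consumes the declarations above, segmenting a GSO skb
 * with skb_gso_segment() and walking the resulting list. The function
 * name is hypothetical and the per-segment transmit is stubbed out.
 */
static inline int example_xmit_gso(struct sk_buff *skb,
				   netdev_features_t features)
{
	struct sk_buff *segs, *seg, *next;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)	/* nothing needed segmenting */
		return -EINVAL;

	skb_list_walk_safe(segs, seg, next) {
		skb_mark_not_on_list(seg);
		consume_skb(seg);	/* placeholder for the real transmit */
	}
	consume_skb(skb);
	return 0;
}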
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/audit.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/**
 * tomoyo_print_bprm - Print "struct linux_binprm" for auditing.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * Returns the contents of @bprm on success, NULL otherwise.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_bprm(struct linux_binprm *bprm,
			       struct tomoyo_page_dump *dump)
{
	static const int tomoyo_buffer_len = 4096 * 2;
	char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS);
	char *cp;
	char *last_start;
	int len;
	unsigned long pos = bprm->p;
	int offset = pos % PAGE_SIZE;
	int argv_count = bprm->argc;
	int envp_count = bprm->envc;
	bool truncated = false;

	if (!buffer)
		return NULL;
	len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ");
	cp = buffer + len;
	if (!argv_count) {
		memmove(cp, "} envp[]={ ", 11);
		cp += 11;
	}
	last_start = cp;
	while (argv_count || envp_count) {
		if (!tomoyo_dump_page(bprm, pos, dump))
			goto out;
		pos += PAGE_SIZE - offset;
		/* Read. */
		while (offset < PAGE_SIZE) {
			const char *kaddr = dump->data;
			const unsigned char c = kaddr[offset++];

			if (cp == last_start)
				*cp++ = '"';
			if (cp >= buffer + tomoyo_buffer_len - 32) {
				/* Reserve some room for "..." string. */
				truncated = true;
			} else if (c == '\\') {
				*cp++ = '\\';
				*cp++ = '\\';
			} else if (c > ' ' && c < 127) {
				*cp++ = c;
			} else if (!c) {
				*cp++ = '"';
				*cp++ = ' ';
				last_start = cp;
			} else {
				*cp++ = '\\';
				*cp++ = (c >> 6) + '0';
				*cp++ = ((c >> 3) & 7) + '0';
				*cp++ = (c & 7) + '0';
			}
			if (c)
				continue;
			if (argv_count) {
				if (--argv_count == 0) {
					if (truncated) {
						cp = last_start;
						memmove(cp, "... ", 4);
						cp += 4;
					}
					memmove(cp, "} envp[]={ ", 11);
					cp += 11;
					last_start = cp;
					truncated = false;
				}
			} else if (envp_count) {
				if (--envp_count == 0) {
					if (truncated) {
						cp = last_start;
						memmove(cp, "... ", 4);
						cp += 4;
					}
				}
			}
			if (!argv_count && !envp_count)
				break;
		}
		offset = 0;
	}
	*cp++ = '}';
	*cp = '\0';
	return buffer;
out:
	snprintf(buffer, tomoyo_buffer_len - 1,
		 "argv[]={ ... } envp[]= { ... }");
	return buffer;
}
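/*
 * Illustrative example, not in the original source: under the escaping
 * rules above, an argument containing "a b" followed by the byte 0x07
 * is logged as argv[]={ "a\040b\007" }; backslashes are doubled,
 * printable bytes are copied verbatim, and every other byte becomes a
 * backslash plus three octal digits.
 */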
/**
 * tomoyo_filetype - Get string representation of file type.
 *
 * @mode: Mode value for stat().
 *
 * Returns file type string.
 */
static inline const char *tomoyo_filetype(const umode_t mode)
{
	switch (mode & S_IFMT) {
	case S_IFREG:
	case 0:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FILE];
	case S_IFDIR:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_DIRECTORY];
	case S_IFLNK:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SYMLINK];
	case S_IFIFO:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_FIFO];
	case S_IFSOCK:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_SOCKET];
	case S_IFBLK:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_BLOCK_DEV];
	case S_IFCHR:
		return tomoyo_condition_keyword[TOMOYO_TYPE_IS_CHAR_DEV];
	}
	return "unknown"; /* This should not happen. */
}

/**
 * tomoyo_print_header - Get header line of audit log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns string representation.
 *
 * This function uses kmalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_header(struct tomoyo_request_info *r)
{
	struct tomoyo_time stamp;
	const pid_t gpid = task_pid_nr(current);
	struct tomoyo_obj_info *obj = r->obj;
	static const int tomoyo_buffer_len = 4096;
	char *buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS);
	int pos;
	u8 i;

	if (!buffer)
		return NULL;

	tomoyo_convert_time(ktime_get_real_seconds(), &stamp);

	pos = snprintf(buffer, tomoyo_buffer_len - 1,
		       "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }",
		       stamp.year, stamp.month, stamp.day, stamp.hour,
		       stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode],
		       str_yes_no(r->granted), gpid, tomoyo_sys_getpid(),
		       tomoyo_sys_getppid(),
		       from_kuid(&init_user_ns, current_uid()),
		       from_kgid(&init_user_ns, current_gid()),
		       from_kuid(&init_user_ns, current_euid()),
		       from_kgid(&init_user_ns, current_egid()),
		       from_kuid(&init_user_ns, current_suid()),
		       from_kgid(&init_user_ns, current_sgid()),
		       from_kuid(&init_user_ns, current_fsuid()),
		       from_kgid(&init_user_ns, current_fsgid()));
	if (!obj)
		goto no_obj_info;
	if (!obj->validate_done) {
		tomoyo_get_attributes(obj);
		obj->validate_done = true;
	}
	for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
		struct tomoyo_mini_stat *stat;
		unsigned int dev;
		umode_t mode;

		if (!obj->stat_valid[i])
			continue;
		stat = &obj->stat[i];
		dev = stat->dev;
		mode = stat->mode;
		if (i & 1) {
			pos += snprintf(buffer + pos,
					tomoyo_buffer_len - 1 - pos,
					" path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }",
					(i >> 1) + 1,
					from_kuid(&init_user_ns, stat->uid),
					from_kgid(&init_user_ns, stat->gid),
					(unsigned long)stat->ino,
					stat->mode & S_IALLUGO);
			continue;
		}
		pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos,
				" path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s",
				(i >> 1) + 1,
				from_kuid(&init_user_ns, stat->uid),
				from_kgid(&init_user_ns, stat->gid),
				(unsigned long)stat->ino,
				MAJOR(dev), MINOR(dev),
				mode & S_IALLUGO, tomoyo_filetype(mode));
		if (S_ISCHR(mode) || S_ISBLK(mode)) {
			dev = stat->rdev;
			pos += snprintf(buffer + pos,
					tomoyo_buffer_len - 1 - pos,
					" dev_major=%u dev_minor=%u",
					MAJOR(dev), MINOR(dev));
		}
		pos += snprintf(buffer + pos, tomoyo_buffer_len - 1 - pos,
				" }");
	}
no_obj_info:
	if (pos < tomoyo_buffer_len - 1)
		return buffer;
	kfree(buffer);
	return NULL;
}

/**
 * tomoyo_init_log - Allocate buffer for audit logs.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * Returns pointer to allocated memory.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
		      va_list args)
{
	char *buf = NULL;
	char *bprm_info = NULL;
	const char *header = NULL;
	char *realpath = NULL;
	const char *symlink = NULL;
	int pos;
	const char *domainname = r->domain->domainname->name;

	header = tomoyo_print_header(r);
	if (!header)
		return NULL;
	/* +10 is for '\n' etc. and '\0'. */
	len += strlen(domainname) + strlen(header) + 10;
	if (r->ee) {
		struct file *file = r->ee->bprm->file;

		realpath = tomoyo_realpath_from_path(&file->f_path);
		bprm_info = tomoyo_print_bprm(r->ee->bprm, &r->ee->dump);
		if (!realpath || !bprm_info)
			goto out;
		/* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */
		len += strlen(realpath) + 80 + strlen(bprm_info);
	} else if (r->obj && r->obj->symlink_target) {
		symlink = r->obj->symlink_target->name;
		/* +18 is for " symlink.target=\"%s\"" */
		len += 18 + strlen(symlink);
	}
	len = kmalloc_size_roundup(len);
	buf = kzalloc(len, GFP_NOFS);
	if (!buf)
		goto out;
	len--;
	pos = snprintf(buf, len, "%s", header);
	if (realpath) {
		struct linux_binprm *bprm = r->ee->bprm;

		pos += snprintf(buf + pos, len - pos,
				" exec={ realpath=\"%s\" argc=%d envc=%d %s }",
				realpath, bprm->argc, bprm->envc, bprm_info);
	} else if (symlink)
		pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"",
				symlink);
	pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname);
	vsnprintf(buf + pos, len - pos, fmt, args);
out:
	kfree(realpath);
	kfree(bprm_info);
	kfree(header);
	return buf;
}

/* Wait queue for /sys/kernel/security/tomoyo/audit. */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait);

/* Structure for audit log. */
struct tomoyo_log {
	struct list_head list;
	char *log;
	int size;
};

/* The list for "struct tomoyo_log". */
static LIST_HEAD(tomoyo_log);

/* Lock for "struct list_head tomoyo_log". */
static DEFINE_SPINLOCK(tomoyo_log_lock);

/* Length of "struct list_head tomoyo_log". */
static unsigned int tomoyo_log_count;

/**
 * tomoyo_get_audit - Get audit mode.
 *
 * @ns:          Pointer to "struct tomoyo_policy_namespace".
 * @profile:     Profile number.
 * @index:       Index number of functionality.
 * @matched_acl: Pointer to "struct tomoyo_acl_info".
 * @is_granted:  True if granted log, false otherwise.
 *
 * Returns true if this request should be audited, false otherwise.
 */
static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns,
			     const u8 profile, const u8 index,
			     const struct tomoyo_acl_info *matched_acl,
			     const bool is_granted)
{
	u8 mode;
	const u8 category = tomoyo_index2category[index] +
		TOMOYO_MAX_MAC_INDEX;
	struct tomoyo_profile *p;

	if (!tomoyo_policy_loaded)
		return false;
	p = tomoyo_profile(ns, profile);
	if (tomoyo_log_count >= p->pref[TOMOYO_PREF_MAX_AUDIT_LOG])
		return false;
	if (is_granted && matched_acl && matched_acl->cond &&
	    matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO)
		return matched_acl->cond->grant_log == TOMOYO_GRANTLOG_YES;
	mode = p->config[index];
	if (mode == TOMOYO_CONFIG_USE_DEFAULT)
		mode = p->config[category];
	if (mode == TOMOYO_CONFIG_USE_DEFAULT)
		mode = p->default_config;
	if (is_granted)
		return mode & TOMOYO_CONFIG_WANT_GRANT_LOG;
	return mode & TOMOYO_CONFIG_WANT_REJECT_LOG;
}

/**
 * tomoyo_write_log2 - Write an audit log.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * Returns nothing.
 */
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
		       va_list args)
{
	char *buf;
	struct tomoyo_log *entry;
	bool quota_exceeded = false;

	if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type,
			      r->matched_acl, r->granted))
		goto out;
	buf = tomoyo_init_log(r, len, fmt, args);
	if (!buf)
		goto out;
	entry = kzalloc(sizeof(*entry), GFP_NOFS);
	if (!entry) {
		kfree(buf);
		goto out;
	}
	entry->log = buf;
	len = kmalloc_size_roundup(strlen(buf) + 1);
	/*
	 * The entry->size is used for memory quota checks.
	 * Don't go beyond strlen(entry->log).
	 */
	entry->size = len + kmalloc_size_roundup(sizeof(*entry));
	spin_lock(&tomoyo_log_lock);
	if (tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] &&
	    tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size >=
	    tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT]) {
		quota_exceeded = true;
	} else {
		tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size;
		list_add_tail(&entry->list, &tomoyo_log);
		tomoyo_log_count++;
	}
	spin_unlock(&tomoyo_log_lock);
	if (quota_exceeded) {
		kfree(buf);
		kfree(entry);
		goto out;
	}
	wake_up(&tomoyo_log_wait);
out:
	return;
}

/**
 * tomoyo_write_log - Write an audit log.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * Returns nothing.
 */
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
{
	va_list args;
	int len;

	va_start(args, fmt);
	len = vsnprintf(NULL, 0, fmt, args) + 1;
	va_end(args);
	va_start(args, fmt);
	tomoyo_write_log2(r, len, fmt, args);
	va_end(args);
}

/**
 * tomoyo_read_log - Read an audit log.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
void tomoyo_read_log(struct tomoyo_io_buffer *head)
{
	struct tomoyo_log *ptr = NULL;

	if (head->r.w_pos)
		return;
	kfree(head->read_buf);
	head->read_buf = NULL;
	spin_lock(&tomoyo_log_lock);
	if (!list_empty(&tomoyo_log)) {
		ptr = list_entry(tomoyo_log.next, typeof(*ptr), list);
		list_del(&ptr->list);
		tomoyo_log_count--;
		tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= ptr->size;
	}
	spin_unlock(&tomoyo_log_lock);
	if (ptr) {
		head->read_buf = ptr->log;
		head->r.w[head->r.w_pos++] = head->read_buf;
		kfree(ptr);
	}
}

/**
 * tomoyo_poll_log - Wait for an audit log.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". May be NULL.
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log.
 */
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait)
{
	if (tomoyo_log_count)
		return EPOLLIN | EPOLLRDNORM;
	poll_wait(file, &tomoyo_log_wait, wait);
	if (tomoyo_log_count)
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}
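/*
 * Illustrative sketch (hypothetical caller and format string, not from
 * this file): feeding the varargs front-end above. tomoyo_write_log()
 * sizes the buffer with a counting vsnprintf() pass and then hands the
 * formatted line to tomoyo_write_log2().
 */
static void example_audit_readlink(struct tomoyo_request_info *r,
				   const char *target)
{
	tomoyo_write_log(r, "file readlink %s\n", target);
}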
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/scatterlist.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/usb.h>

#define SIMPLE_IO_TIMEOUT	10000	/* in milliseconds */

/*-------------------------------------------------------------------------*/

static int override_alt = -1;
module_param_named(alt, override_alt, int, 0644);
MODULE_PARM_DESC(alt, ">= 0 to override altsetting selection");
static void complicated_callback(struct urb *urb);

/*-------------------------------------------------------------------------*/

/* FIXME make these public somewhere; usbdevfs.h? */

/* Parameter for usbtest driver. */
struct usbtest_param_32 {
	/* inputs */
	__u32		test_num;	/* 0..(TEST_CASES-1) */
	__u32		iterations;
	__u32		length;
	__u32		vary;
	__u32		sglen;

	/* outputs */
	__s32		duration_sec;
	__s32		duration_usec;
};

/*
 * Compat parameter to the usbtest driver.
 * This supports older user space binaries compiled with 64 bit compiler.
 */
struct usbtest_param_64 {
	/* inputs */
	__u32		test_num;	/* 0..(TEST_CASES-1) */
	__u32		iterations;
	__u32		length;
	__u32		vary;
	__u32		sglen;

	/* outputs */
	__s64		duration_sec;
	__s64		duration_usec;
};

/* IOCTL interface to the driver. */
#define USBTEST_REQUEST_32	_IOWR('U', 100, struct usbtest_param_32)

/* COMPAT IOCTL interface to the driver. */
#define USBTEST_REQUEST_64	_IOWR('U', 100, struct usbtest_param_64)

/*-------------------------------------------------------------------------*/

#define	GENERIC		/* let probe() bind using module params */

/* Some devices that can be used for testing will have "real" drivers.
 * Entries for those need to be enabled here by hand, after disabling
 * that "real" driver.
 */
//#define	IBOT2		/* grab iBOT2 webcams */
//#define	KEYSPAN_19Qi	/* grab un-renumerated serial adapter */

/*-------------------------------------------------------------------------*/

struct usbtest_info {
	const char		*name;
	u8			ep_in;		/* bulk/intr source */
	u8			ep_out;		/* bulk/intr sink */
	unsigned		autoconf:1;
	unsigned		ctrl_out:1;
	unsigned		iso:1;		/* try iso in/out */
	unsigned		intr:1;		/* try interrupt in/out */
	int			alt;
};

/* this is accessed only through usbfs ioctl calls.
 * one ioctl to issue a test ... one lock per device.
 * tests create other threads if they need them.
 * urbs and buffers are allocated dynamically,
 * and data generated deterministically.
 */
struct usbtest_dev {
	struct usb_interface	*intf;
	struct usbtest_info	*info;
	int			in_pipe;
	int			out_pipe;
	int			in_iso_pipe;
	int			out_iso_pipe;
	int			in_int_pipe;
	int			out_int_pipe;
	struct usb_endpoint_descriptor	*iso_in, *iso_out;
	struct usb_endpoint_descriptor	*int_in, *int_out;
	struct mutex		lock;

#define TBUF_SIZE	256
	u8			*buf;
};

static struct usb_device *testdev_to_usbdev(struct usbtest_dev *test)
{
	return interface_to_usbdev(test->intf);
}

/* set up all urbs so they can be used with either bulk or interrupt */
#define	INTERRUPT_RATE		1	/* msec/transfer */

#define ERROR(tdev, fmt, args...) \
	dev_err(&(tdev)->intf->dev , fmt , ## args)
#define WARNING(tdev, fmt, args...) \
	dev_warn(&(tdev)->intf->dev , fmt , ## args)

#define GUARD_BYTE	0xA5
#define MAX_SGLEN	128

/*-------------------------------------------------------------------------*/

static inline void endpoint_update(int edi,
				   struct usb_host_endpoint **in,
				   struct usb_host_endpoint **out,
				   struct usb_host_endpoint *e)
{
	if (edi) {
		if (!*in)
			*in = e;
	} else {
		if (!*out)
			*out = e;
	}
}

static int
get_endpoints(struct usbtest_dev *dev, struct usb_interface *intf)
{
	int tmp;
	struct usb_host_interface *alt;
	struct usb_host_endpoint *in, *out;
	struct usb_host_endpoint *iso_in, *iso_out;
	struct usb_host_endpoint *int_in, *int_out;
	struct usb_device *udev;

	for (tmp = 0; tmp < intf->num_altsetting; tmp++) {
		unsigned ep;

		in = out = NULL;
		iso_in = iso_out = NULL;
		int_in = int_out = NULL;
		alt = intf->altsetting + tmp;

		if (override_alt >= 0 &&
				override_alt != alt->desc.bAlternateSetting)
			continue;

		/* take the first altsetting with in-bulk + out-bulk;
		 * ignore other endpoints and altsettings.
		 */
		for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) {
			struct usb_host_endpoint	*e;
			int edi;

			e = alt->endpoint + ep;
			edi = usb_endpoint_dir_in(&e->desc);

			switch (usb_endpoint_type(&e->desc)) {
			case USB_ENDPOINT_XFER_BULK:
				endpoint_update(edi, &in, &out, e);
				continue;
			case USB_ENDPOINT_XFER_INT:
				if (dev->info->intr)
					endpoint_update(edi, &int_in,
							&int_out, e);
				continue;
			case USB_ENDPOINT_XFER_ISOC:
				if (dev->info->iso)
					endpoint_update(edi, &iso_in,
							&iso_out, e);
				fallthrough;
			default:
				continue;
			}
		}
		if ((in && out) || iso_in || iso_out || int_in || int_out)
			goto found;
	}
	return -EINVAL;

found:
	udev = testdev_to_usbdev(dev);
	dev->info->alt = alt->desc.bAlternateSetting;
	if (alt->desc.bAlternateSetting != 0) {
		tmp = usb_set_interface(udev,
				alt->desc.bInterfaceNumber,
				alt->desc.bAlternateSetting);
		if (tmp < 0)
			return tmp;
	}

	if (in)
		dev->in_pipe = usb_rcvbulkpipe(udev,
			in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);
	if (out)
		dev->out_pipe = usb_sndbulkpipe(udev,
			out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK);

	if (iso_in) {
		dev->iso_in = &iso_in->desc;
		dev->in_iso_pipe = usb_rcvisocpipe(udev,
				iso_in->desc.bEndpointAddress
					& USB_ENDPOINT_NUMBER_MASK);
	}

	if (iso_out) {
		dev->iso_out = &iso_out->desc;
		dev->out_iso_pipe = usb_sndisocpipe(udev,
				iso_out->desc.bEndpointAddress
					& USB_ENDPOINT_NUMBER_MASK);
	}

	if (int_in) {
		dev->int_in = &int_in->desc;
		dev->in_int_pipe = usb_rcvintpipe(udev,
				int_in->desc.bEndpointAddress
					& USB_ENDPOINT_NUMBER_MASK);
	}

	if (int_out) {
		dev->int_out = &int_out->desc;
		dev->out_int_pipe = usb_sndintpipe(udev,
				int_out->desc.bEndpointAddress
					& USB_ENDPOINT_NUMBER_MASK);
	}
	return 0;
}

/*-------------------------------------------------------------------------*/

/* Support for testing basic non-queued I/O streams.
 *
 * These just package urbs as requests that can be easily canceled.
 * Each urb's data buffer is dynamically allocated; callers can fill
 * them with non-zero test data (or test for it) when appropriate.
 */

static void simple_callback(struct urb *urb)
{
	complete(urb->context);
}

static struct urb *usbtest_alloc_urb(
	struct usb_device	*udev,
	int			pipe,
	unsigned long		bytes,
	unsigned		transfer_flags,
	unsigned		offset,
	u8			bInterval,
	usb_complete_t		complete_fn)
{
	struct urb		*urb;

	urb = usb_alloc_urb(0, GFP_KERNEL);
	if (!urb)
		return urb;

	if (bInterval)
		usb_fill_int_urb(urb, udev, pipe, NULL, bytes, complete_fn,
				NULL, bInterval);
	else
		usb_fill_bulk_urb(urb, udev, pipe, NULL, bytes, complete_fn,
				NULL);

	urb->interval = (udev->speed == USB_SPEED_HIGH)
			? (INTERRUPT_RATE << 3)
			: INTERRUPT_RATE;
	urb->transfer_flags = transfer_flags;
	if (usb_pipein(pipe))
		urb->transfer_flags |= URB_SHORT_NOT_OK;

	if ((bytes + offset) == 0)
		return urb;

	if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)
		urb->transfer_buffer = usb_alloc_coherent(udev, bytes + offset,
			GFP_KERNEL, &urb->transfer_dma);
	else
		urb->transfer_buffer = kmalloc(bytes + offset, GFP_KERNEL);
	if (!urb->transfer_buffer) {
		usb_free_urb(urb);
		return NULL;
	}

	/* To test unaligned transfers add an offset and fill the
		unused memory with a guard value */
	if (offset) {
		memset(urb->transfer_buffer, GUARD_BYTE, offset);
		urb->transfer_buffer += offset;
		if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)
			urb->transfer_dma += offset;
	}

	/* For inbound transfers use guard byte so that test fails if
		data not correctly copied */
	memset(urb->transfer_buffer,
			usb_pipein(urb->pipe) ? GUARD_BYTE : 0,
			bytes);
	return urb;
}

static struct urb *simple_alloc_urb(
	struct usb_device	*udev,
	int			pipe,
	unsigned long		bytes,
	u8			bInterval)
{
	return usbtest_alloc_urb(udev, pipe, bytes, URB_NO_TRANSFER_DMA_MAP, 0,
			bInterval, simple_callback);
}

static struct urb *complicated_alloc_urb(
	struct usb_device	*udev,
	int			pipe,
	unsigned long		bytes,
	u8			bInterval)
{
	return usbtest_alloc_urb(udev, pipe, bytes, URB_NO_TRANSFER_DMA_MAP, 0,
			bInterval, complicated_callback);
}

static unsigned pattern;
static unsigned mod_pattern;
module_param_named(pattern, mod_pattern, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(mod_pattern, "i/o pattern (0 == zeroes)");

static unsigned get_maxpacket(struct usb_device *udev, int pipe)
{
	struct usb_host_endpoint	*ep;

	ep = usb_pipe_endpoint(udev, pipe);
	return le16_to_cpup(&ep->desc.wMaxPacketSize);
}

static int ss_isoc_get_packet_num(struct usb_device *udev, int pipe)
{
	struct usb_host_endpoint *ep = usb_pipe_endpoint(udev, pipe);

	return USB_SS_MULT(ep->ss_ep_comp.bmAttributes)
		* (1 + ep->ss_ep_comp.bMaxBurst);
}

static void simple_fill_buf(struct urb *urb)
{
	unsigned	i;
	u8		*buf = urb->transfer_buffer;
	unsigned	len = urb->transfer_buffer_length;
	unsigned	maxpacket;

	switch (pattern) {
	default:
		fallthrough;
	case 0:
		memset(buf, 0, len);
		break;
	case 1:			/* mod63 */
		maxpacket = get_maxpacket(urb->dev, urb->pipe);
		for (i = 0; i < len; i++)
			*buf++ = (u8) ((i % maxpacket) % 63);
		break;
	}
}

static inline unsigned long buffer_offset(void *buf)
{
	return (unsigned long)buf & (ARCH_KMALLOC_MINALIGN - 1);
}

static int check_guard_bytes(struct usbtest_dev *tdev, struct urb *urb)
{
	u8 *buf = urb->transfer_buffer;
	u8 *guard = buf - buffer_offset(buf);
	unsigned i;

	for (i = 0; guard < buf; i++, guard++) {
		if (*guard != GUARD_BYTE) {
			ERROR(tdev, "guard byte[%d] %d (not %d)\n",
				i, *guard, GUARD_BYTE);
			return -EINVAL;
		}
	}
	return 0;
}

static int simple_check_buf(struct usbtest_dev *tdev, struct urb *urb)
{
	unsigned	i;
	u8		expected;
	u8		*buf = urb->transfer_buffer;
	unsigned	len = urb->actual_length;
	unsigned	maxpacket = get_maxpacket(urb->dev, urb->pipe);

	int ret = check_guard_bytes(tdev, urb);
	if (ret)
		return ret;

	for (i = 0; i < len; i++, buf++) {
		switch (pattern) {
		/* all-zeroes has no synchronization issues */
		case 0:
			expected = 0;
			break;
		/* mod63 stays in sync with short-terminated transfers,
		 * or otherwise when host and gadget agree on how large
		 * each usb transfer request should be.  resync is done
		 * with set_interface or set_config.
		 */
		case 1:			/* mod63 */
			expected = (i % maxpacket) % 63;
			break;
		/* always fail unsupported patterns */
		default:
			expected = !*buf;
			break;
		}
		if (*buf == expected)
			continue;
		ERROR(tdev, "buf[%d] = %d (not %d)\n", i, *buf, expected);
		return -EINVAL;
	}
	return 0;
}

static void simple_free_urb(struct urb *urb)
{
	unsigned long offset = buffer_offset(urb->transfer_buffer);

	if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)
		usb_free_coherent(
			urb->dev,
			urb->transfer_buffer_length + offset,
			urb->transfer_buffer - offset,
			urb->transfer_dma - offset);
	else
		kfree(urb->transfer_buffer - offset);
	usb_free_urb(urb);
}

static int simple_io(
	struct usbtest_dev	*tdev,
	struct urb		*urb,
	int			iterations,
	int			vary,
	int			expected,
	const char		*label
)
{
	struct usb_device	*udev = urb->dev;
	int			max = urb->transfer_buffer_length;
	struct completion	completion;
	int			retval = 0;
	unsigned long		expire;

	urb->context = &completion;
	while (retval == 0 && iterations-- > 0) {
		init_completion(&completion);
		if (usb_pipeout(urb->pipe)) {
			simple_fill_buf(urb);
			urb->transfer_flags |= URB_ZERO_PACKET;
		}
		retval = usb_submit_urb(urb, GFP_KERNEL);
		if (retval != 0)
			break;

		expire = msecs_to_jiffies(SIMPLE_IO_TIMEOUT);
		if (!wait_for_completion_timeout(&completion, expire)) {
			usb_kill_urb(urb);
			retval = (urb->status == -ENOENT ?
				  -ETIMEDOUT : urb->status);
		} else {
			retval = urb->status;
		}

		urb->dev = udev;
		if (retval == 0 && usb_pipein(urb->pipe))
			retval = simple_check_buf(tdev, urb);

		if (vary) {
			int len = urb->transfer_buffer_length;

			len += vary;
			len %= max;
			if (len == 0)
				len = (vary < max) ? vary : max;
			urb->transfer_buffer_length = len;
		}

		/* FIXME if endpoint halted, clear halt (and log) */
	}
	urb->transfer_buffer_length = max;

	if (expected != retval)
		dev_err(&udev->dev,
			"%s failed, iterations left %d, status %d (not %d)\n",
				label, iterations, retval, expected);
	return retval;
}
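/*
 * Illustrative sketch (hypothetical test case, not from this file): how
 * the basic test cases drive simple_io() above; allocate an urb on the
 * bulk IN pipe, run it for a number of iterations expecting success,
 * then release it.
 */
static int example_simple_in_test(struct usbtest_dev *dev, int iterations,
				  unsigned long bytes)
{
	struct urb	*urb;
	int		retval;

	urb = simple_alloc_urb(testdev_to_usbdev(dev), dev->in_pipe,
			bytes, 0);
	if (!urb)
		return -ENOMEM;
	retval = simple_io(dev, urb, iterations, 0, 0, "example-in");
	simple_free_urb(urb);
	return retval;
}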
/*-------------------------------------------------------------------------*/

/* We use scatterlist primitives to test queued I/O.
 * Yes, this also tests the scatterlist primitives.
 */

static void free_sglist(struct scatterlist *sg, int nents)
{
	unsigned		i;

	if (!sg)
		return;
	for (i = 0; i < nents; i++) {
		if (!sg_page(&sg[i]))
			continue;
		kfree(sg_virt(&sg[i]));
	}
	kfree(sg);
}

static struct scatterlist *
alloc_sglist(int nents, int max, int vary, struct usbtest_dev *dev, int pipe)
{
	struct scatterlist	*sg;
	unsigned int		n_size = 0;
	unsigned		i;
	unsigned		size = max;
	unsigned		maxpacket =
		get_maxpacket(interface_to_usbdev(dev->intf), pipe);

	if (max == 0)
		return NULL;

	sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
	if (!sg)
		return NULL;
	sg_init_table(sg, nents);

	for (i = 0; i < nents; i++) {
		char		*buf;
		unsigned	j;

		buf = kzalloc(size, GFP_KERNEL);
		if (!buf) {
			free_sglist(sg, i);
			return NULL;
		}

		/* kmalloc pages are always physically contiguous! */
		sg_set_buf(&sg[i], buf, size);

		switch (pattern) {
		case 0:
			/* already zeroed */
			break;
		case 1:
			for (j = 0; j < size; j++)
				*buf++ = (u8) (((j + n_size) % maxpacket) % 63);
			n_size += size;
			break;
		}

		if (vary) {
			size += vary;
			size %= max;
			if (size == 0)
				size = (vary < max) ? vary : max;
		}
	}

	return sg;
}

struct sg_timeout {
	struct timer_list timer;
	struct usb_sg_request *req;
};

static void sg_timeout(struct timer_list *t)
{
	struct sg_timeout *timeout = from_timer(timeout, t, timer);

	usb_sg_cancel(timeout->req);
}

static int perform_sglist(
	struct usbtest_dev	*tdev,
	unsigned		iterations,
	int			pipe,
	struct usb_sg_request	*req,
	struct scatterlist	*sg,
	int			nents
)
{
	struct usb_device	*udev = testdev_to_usbdev(tdev);
	int			retval = 0;
	struct sg_timeout	timeout = {
		.req = req,
	};

	timer_setup_on_stack(&timeout.timer, sg_timeout, 0);

	while (retval == 0 && iterations-- > 0) {
		retval = usb_sg_init(req, udev, pipe,
				(udev->speed == USB_SPEED_HIGH)
					? (INTERRUPT_RATE << 3)
					: INTERRUPT_RATE,
				sg, nents, 0, GFP_KERNEL);

		if (retval)
			break;
		mod_timer(&timeout.timer, jiffies +
				msecs_to_jiffies(SIMPLE_IO_TIMEOUT));
		usb_sg_wait(req);
		if (!del_timer_sync(&timeout.timer))
			retval = -ETIMEDOUT;
		else
			retval = req->status;
		destroy_timer_on_stack(&timeout.timer);

		/* FIXME check resulting data pattern */

		/* FIXME if endpoint halted, clear halt (and log) */
	}

	/* FIXME for unlink or fault handling tests, don't report
	 * failure if retval is as we expected ...
	 */
	if (retval)
		ERROR(tdev, "perform_sglist failed, "
				"iterations left %d, status %d\n",
				iterations, retval);
	return retval;
}
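/*
 * Illustrative sketch (hypothetical test case, not from this file):
 * pairing alloc_sglist() with perform_sglist() the way the queued-I/O
 * tests do, here with a fixed 8-entry list and no size variation.
 */
static int example_sg_out_test(struct usbtest_dev *dev, unsigned iterations,
			       int max)
{
	struct usb_sg_request	req;
	struct scatterlist	*sg;
	int			retval;

	sg = alloc_sglist(8, max, 0, dev, dev->out_pipe);
	if (!sg)
		return -ENOMEM;
	retval = perform_sglist(dev, iterations, dev->out_pipe, &req, sg, 8);
	free_sglist(sg, 8);
	return retval;
}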
/*-------------------------------------------------------------------------*/

/* unqueued control message testing
 *
 * there's a nice set of device functional requirements in chapter 9 of the
 * usb 2.0 spec, which we can apply to ANY device, even ones that don't use
 * special test firmware.
 *
 * we know the device is configured (or suspended) by the time it's visible
 * through usbfs.  we can't change that, so we won't test enumeration (which
 * worked 'well enough' to get here, this time), power management (ditto),
 * or remote wakeup (which needs human interaction).
 */

static unsigned realworld = 1;
module_param(realworld, uint, 0);
MODULE_PARM_DESC(realworld, "clear to demand stricter spec compliance");

static int get_altsetting(struct usbtest_dev *dev)
{
	struct usb_interface	*iface = dev->intf;
	struct usb_device	*udev = interface_to_usbdev(iface);
	int			retval;

	retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
			USB_REQ_GET_INTERFACE, USB_DIR_IN|USB_RECIP_INTERFACE,
			0, iface->altsetting[0].desc.bInterfaceNumber,
			dev->buf, 1, USB_CTRL_GET_TIMEOUT);
	switch (retval) {
	case 1:
		return dev->buf[0];
	case 0:
		retval = -ERANGE;
		fallthrough;
	default:
		return retval;
	}
}

static int set_altsetting(struct usbtest_dev *dev, int alternate)
{
	struct usb_interface		*iface = dev->intf;
	struct usb_device		*udev;

	if (alternate < 0 || alternate >= 256)
		return -EINVAL;

	udev = interface_to_usbdev(iface);
	return usb_set_interface(udev,
			iface->altsetting[0].desc.bInterfaceNumber,
			alternate);
}

static int is_good_config(struct usbtest_dev *tdev, int len)
{
	struct usb_config_descriptor	*config;

	if (len < (int)sizeof(*config))
		return 0;
	config = (struct usb_config_descriptor *) tdev->buf;

	switch (config->bDescriptorType) {
	case USB_DT_CONFIG:
	case USB_DT_OTHER_SPEED_CONFIG:
		if (config->bLength != 9) {
			ERROR(tdev, "bogus config descriptor length\n");
			return 0;
		}
		/* this bit 'must be 1' but often isn't */
		if (!realworld && !(config->bmAttributes & 0x80)) {
			ERROR(tdev, "high bit of config attributes not set\n");
			return 0;
		}
		if (config->bmAttributes & 0x1f) {	/* reserved == 0 */
			ERROR(tdev, "reserved config bits set\n");
			return 0;
		}
		break;
	default:
		return 0;
	}

	if (le16_to_cpu(config->wTotalLength) == len)	/* read it all */
		return 1;
	if (le16_to_cpu(config->wTotalLength) >= TBUF_SIZE)	/* max partial read */
		return 1;
	ERROR(tdev, "bogus config descriptor read size\n");
	return 0;
}

static int is_good_ext(struct usbtest_dev *tdev, u8 *buf)
{
	struct usb_ext_cap_descriptor *ext;
	u32 attr;

	ext = (struct usb_ext_cap_descriptor *) buf;

	if (ext->bLength != USB_DT_USB_EXT_CAP_SIZE) {
		ERROR(tdev, "bogus usb 2.0 extension descriptor length\n");
		return 0;
	}

	attr = le32_to_cpu(ext->bmAttributes);
	/* bits[1:15] is used and others are reserved */
	if (attr & ~0xfffe) {	/* reserved == 0 */
		ERROR(tdev, "reserved bits set\n");
		return 0;
	}

	return 1;
}

static int is_good_ss_cap(struct usbtest_dev *tdev, u8 *buf)
{
	struct usb_ss_cap_descriptor *ss;

	ss = (struct usb_ss_cap_descriptor *) buf;

	if (ss->bLength != USB_DT_USB_SS_CAP_SIZE) {
		ERROR(tdev, "bogus superspeed device capability descriptor length\n");
		return 0;
	}

	/*
	 * only bit[1] of bmAttributes is used for LTM and others are
	 * reserved
	 */
	if (ss->bmAttributes & ~0x02) {	/* reserved == 0 */
		ERROR(tdev, "reserved bits set in bmAttributes\n");
		return 0;
	}

	/* bits[0:3] of wSpeedSupported is used and others are reserved */
	if (le16_to_cpu(ss->wSpeedSupported) & ~0x0f) {	/* reserved == 0 */
		ERROR(tdev, "reserved bits set in wSpeedSupported\n");
		return 0;
	}

	return 1;
}

static int is_good_con_id(struct usbtest_dev *tdev, u8 *buf)
{
	struct usb_ss_container_id_descriptor *con_id;

	con_id = (struct usb_ss_container_id_descriptor *) buf;

	if (con_id->bLength != USB_DT_USB_SS_CONTN_ID_SIZE) {
		ERROR(tdev, "bogus container id descriptor length\n");
		return 0;
	}

	if (con_id->bReserved) {	/* reserved == 0 */
		ERROR(tdev, "reserved bits set\n");
		return 0;
	}

	return 1;
}

/* sanity test for standard requests working with usb_control_msg() and some
 * of the utility functions which use it.
 *
 * this doesn't test how endpoint halts behave or data toggles get set, since
 * we won't do I/O to bulk/interrupt endpoints here (which is how to change
 * halt or toggle).  toggle testing is impractical without support from hcds.
 *
 * this avoids failing devices linux would normally work with, by not testing
 * config/altsetting operations for devices that only support their defaults.
 * such devices rarely support those needless operations.
 *
 * NOTE that since this is a sanity test, it's not examining boundary cases
 * to see if usbcore, hcd, and device all behave right.  such testing would
 * involve varied read sizes and other operation sequences.
 */
static int ch9_postconfig(struct usbtest_dev *dev)
{
	struct usb_interface	*iface = dev->intf;
	struct usb_device	*udev = interface_to_usbdev(iface);
	int			i, alt, retval;

	/* [9.2.3] if there's more than one altsetting, we need to be able to
	 * set and get each one.  mostly trusts the descriptors from usbcore.
	 */
	for (i = 0; i < iface->num_altsetting; i++) {

		/* 9.2.3 constrains the range here */
		alt = iface->altsetting[i].desc.bAlternateSetting;
		if (alt < 0 || alt >= iface->num_altsetting) {
			dev_err(&iface->dev,
					"invalid alt [%d].bAltSetting = %d\n",
					i, alt);
		}

		/* [real world] get/set unimplemented if there's only one */
		if (realworld && iface->num_altsetting == 1)
			continue;

		/* [9.4.10] set_interface */
		retval = set_altsetting(dev, alt);
		if (retval) {
			dev_err(&iface->dev, "can't set_interface = %d, %d\n",
					alt, retval);
			return retval;
		}

		/* [9.4.4] get_interface always works */
		retval = get_altsetting(dev);
		if (retval != alt) {
			dev_err(&iface->dev, "get alt should be %d, was %d\n",
					alt, retval);
			return (retval < 0) ? retval : -EDOM;
		}

	}

	/* [real world] get_config unimplemented if there's only one */
	if (!realworld || udev->descriptor.bNumConfigurations != 1) {
		int	expected = udev->actconfig->desc.bConfigurationValue;

		/* [9.4.2] get_configuration always works
		 * ... although some cheap devices (like one TI Hub I've got)
		 * won't return config descriptors except before set_config.
		 */
		retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0),
				USB_REQ_GET_CONFIGURATION,
				USB_DIR_IN | USB_RECIP_DEVICE,
				0, 0, dev->buf, 1, USB_CTRL_GET_TIMEOUT);
		if (retval != 1 || dev->buf[0] != expected) {
			dev_err(&iface->dev, "get config --> %d %d (1 %d)\n",
				retval, dev->buf[0], expected);
			return (retval < 0) ? retval : -EDOM;
		}
	}

	/* there's always [9.4.3] a device descriptor [9.6.1] */
	retval = usb_get_descriptor(udev, USB_DT_DEVICE, 0,
			dev->buf, sizeof(udev->descriptor));
	if (retval != sizeof(udev->descriptor)) {
		dev_err(&iface->dev, "dev descriptor --> %d\n", retval);
		return (retval < 0) ? retval : -EDOM;
	}

	/*
	 * there's always [9.4.3] a bos device descriptor [9.6.2] in USB
	 * 3.0 spec
	 */
	if (le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0210) {
		struct usb_bos_descriptor *bos = NULL;
		struct usb_dev_cap_header *header = NULL;
		unsigned total, num, length;
		u8 *buf;

		retval = usb_get_descriptor(udev, USB_DT_BOS, 0, dev->buf,
				sizeof(*udev->bos->desc));
		if (retval != sizeof(*udev->bos->desc)) {
			dev_err(&iface->dev, "bos descriptor --> %d\n", retval);
			return (retval < 0) ? retval : -EDOM;
		}

		bos = (struct usb_bos_descriptor *)dev->buf;
		total = le16_to_cpu(bos->wTotalLength);
		num = bos->bNumDeviceCaps;
		if (total > TBUF_SIZE)
			total = TBUF_SIZE;

		/*
		 * get generic device-level capability descriptors [9.6.2]
		 * in USB 3.0 spec
		 */
		retval = usb_get_descriptor(udev, USB_DT_BOS, 0, dev->buf,
				total);
		if (retval != total) {
			dev_err(&iface->dev, "bos descriptor set --> %d\n",
					retval);
			return (retval < 0) ?
retval : -EDOM; } length = sizeof(*udev->bos->desc); buf = dev->buf; for (i = 0; i < num; i++) { buf += length; if (buf + sizeof(struct usb_dev_cap_header) > dev->buf + total) break; header = (struct usb_dev_cap_header *)buf; length = header->bLength; if (header->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { dev_warn(&udev->dev, "not device capability descriptor, skip\n"); continue; } switch (header->bDevCapabilityType) { case USB_CAP_TYPE_EXT: if (buf + USB_DT_USB_EXT_CAP_SIZE > dev->buf + total || !is_good_ext(dev, buf)) { dev_err(&iface->dev, "bogus usb 2.0 extension descriptor\n"); return -EDOM; } break; case USB_SS_CAP_TYPE: if (buf + USB_DT_USB_SS_CAP_SIZE > dev->buf + total || !is_good_ss_cap(dev, buf)) { dev_err(&iface->dev, "bogus superspeed device capability descriptor\n"); return -EDOM; } break; case CONTAINER_ID_TYPE: if (buf + USB_DT_USB_SS_CONTN_ID_SIZE > dev->buf + total || !is_good_con_id(dev, buf)) { dev_err(&iface->dev, "bogus container id descriptor\n"); return -EDOM; } break; default: break; } } } /* there's always [9.4.3] at least one config descriptor [9.6.3] */ for (i = 0; i < udev->descriptor.bNumConfigurations; i++) { retval = usb_get_descriptor(udev, USB_DT_CONFIG, i, dev->buf, TBUF_SIZE); if (!is_good_config(dev, retval)) { dev_err(&iface->dev, "config [%d] descriptor --> %d\n", i, retval); return (retval < 0) ? retval : -EDOM; } /* FIXME cross-checking udev->config[i] to make sure usbcore * parsed it right (etc) would be good testing paranoia */ } /* and sometimes [9.2.6.6] speed dependent descriptors */ if (le16_to_cpu(udev->descriptor.bcdUSB) == 0x0200) { struct usb_qualifier_descriptor *d = NULL; /* device qualifier [9.6.2] */ retval = usb_get_descriptor(udev, USB_DT_DEVICE_QUALIFIER, 0, dev->buf, sizeof(struct usb_qualifier_descriptor)); if (retval == -EPIPE) { if (udev->speed == USB_SPEED_HIGH) { dev_err(&iface->dev, "hs dev qualifier --> %d\n", retval); return retval; } /* usb2.0 but not high-speed capable; fine */ } else if (retval != sizeof(struct usb_qualifier_descriptor)) { dev_err(&iface->dev, "dev qualifier --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } else d = (struct usb_qualifier_descriptor *) dev->buf; /* might not have [9.6.2] any other-speed configs [9.6.4] */ if (d) { unsigned max = d->bNumConfigurations; for (i = 0; i < max; i++) { retval = usb_get_descriptor(udev, USB_DT_OTHER_SPEED_CONFIG, i, dev->buf, TBUF_SIZE); if (!is_good_config(dev, retval)) { dev_err(&iface->dev, "other speed config --> %d\n", retval); return (retval < 0) ? retval : -EDOM; } } } } /* FIXME fetch strings from at least the device descriptor */ /* [9.4.5] get_status always works */ retval = usb_get_std_status(udev, USB_RECIP_DEVICE, 0, dev->buf); if (retval) { dev_err(&iface->dev, "get dev status --> %d\n", retval); return retval; } /* FIXME configuration.bmAttributes says if we could try to set/clear * the device's remote wakeup feature ... if we can, test that here */ retval = usb_get_std_status(udev, USB_RECIP_INTERFACE, iface->altsetting[0].desc.bInterfaceNumber, dev->buf); if (retval) { dev_err(&iface->dev, "get interface status --> %d\n", retval); return retval; } /* FIXME get status for each endpoint in the interface */ return 0; } /*-------------------------------------------------------------------------*/ /* use ch9 requests to test whether: * (a) queues work for control, keeping N subtests queued and * active (auto-resubmit) for M loops through the queue. * (b) protocol stalls (control-only) will autorecover. 
* it's not like bulk/intr; no halt clearing. * (c) short control reads are reported and handled. * (d) queues are always processed in-order */ struct ctrl_ctx { spinlock_t lock; struct usbtest_dev *dev; struct completion complete; unsigned count; unsigned pending; int status; struct urb **urb; struct usbtest_param_32 *param; int last; }; #define NUM_SUBCASES 16 /* how many test subcases here? */ struct subcase { struct usb_ctrlrequest setup; int number; int expected; }; static void ctrl_complete(struct urb *urb) { struct ctrl_ctx *ctx = urb->context; struct usb_ctrlrequest *reqp; struct subcase *subcase; int status = urb->status; unsigned long flags; reqp = (struct usb_ctrlrequest *)urb->setup_packet; subcase = container_of(reqp, struct subcase, setup); spin_lock_irqsave(&ctx->lock, flags); ctx->count--; ctx->pending--; /* queue must transfer and complete in fifo order, unless * usb_unlink_urb() is used to unlink something not at the * physical queue head (not tested). */ if (subcase->number > 0) { if ((subcase->number - ctx->last) != 1) { ERROR(ctx->dev, "subcase %d completed out of order, last %d\n", subcase->number, ctx->last); status = -EDOM; ctx->last = subcase->number; goto error; } } ctx->last = subcase->number; /* succeed or fault in only one way? */ if (status == subcase->expected) status = 0; /* async unlink for cleanup? */ else if (status != -ECONNRESET) { /* some faults are allowed, not required */ if (subcase->expected > 0 && ( ((status == -subcase->expected /* happened */ || status == 0)))) /* didn't */ status = 0; /* sometimes more than one fault is allowed */ else if (subcase->number == 12 && status == -EPIPE) status = 0; else ERROR(ctx->dev, "subtest %d error, status %d\n", subcase->number, status); } /* unexpected status codes mean errors; ideally, in hardware */ if (status) { error: if (ctx->status == 0) { int i; ctx->status = status; ERROR(ctx->dev, "control queue %02x.%02x, err %d, " "%d left, subcase %d, len %d/%d\n", reqp->bRequestType, reqp->bRequest, status, ctx->count, subcase->number, urb->actual_length, urb->transfer_buffer_length); /* FIXME this "unlink everything" exit route should * be a separate test case. 
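 *
 * (in the loop below, usb_unlink_urb() answering -EINPROGRESS, -EBUSY
 * or -EIDRM just means the victim urb is already unlinking, completing,
 * or no longer queued; only other return codes merit the ERROR.)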
*/ /* unlink whatever's still pending */ for (i = 1; i < ctx->param->sglen; i++) { struct urb *u = ctx->urb[ (i + subcase->number) % ctx->param->sglen]; if (u == urb || !u->dev) continue; spin_unlock(&ctx->lock); status = usb_unlink_urb(u); spin_lock(&ctx->lock); switch (status) { case -EINPROGRESS: case -EBUSY: case -EIDRM: continue; default: ERROR(ctx->dev, "urb unlink --> %d\n", status); } } status = ctx->status; } } /* resubmit if we need to, else mark this as done */ if ((status == 0) && (ctx->pending < ctx->count)) { status = usb_submit_urb(urb, GFP_ATOMIC); if (status != 0) { ERROR(ctx->dev, "can't resubmit ctrl %02x.%02x, err %d\n", reqp->bRequestType, reqp->bRequest, status); urb->dev = NULL; } else ctx->pending++; } else urb->dev = NULL; /* signal completion when nothing's queued */ if (ctx->pending == 0) complete(&ctx->complete); spin_unlock_irqrestore(&ctx->lock, flags); } static int test_ctrl_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param) { struct usb_device *udev = testdev_to_usbdev(dev); struct urb **urb; struct ctrl_ctx context; int i; if (param->sglen == 0 || param->iterations > UINT_MAX / param->sglen) return -EOPNOTSUPP; spin_lock_init(&context.lock); context.dev = dev; init_completion(&context.complete); context.count = param->sglen * param->iterations; context.pending = 0; context.status = -ENOMEM; context.param = param; context.last = -1; /* allocate and init the urbs we'll queue. * as with bulk/intr sglists, sglen is the queue depth; it also * controls which subtests run (more tests than sglen) or rerun. */ urb = kcalloc(param->sglen, sizeof(struct urb *), GFP_KERNEL); if (!urb) return -ENOMEM; for (i = 0; i < param->sglen; i++) { int pipe = usb_rcvctrlpipe(udev, 0); unsigned len; struct urb *u; struct usb_ctrlrequest req; struct subcase *reqp; /* sign of this variable means: * -: tested code must return this (negative) error code * +: tested code may return this (negative too) error code */ int expected = 0; /* requests here are mostly expected to succeed on any * device, but some are chosen to trigger protocol stalls * or short reads. 
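 *
 * (in urb->status terms: a protocol STALL on ep0 surfaces as -EPIPE,
 * and a short control read flagged as an error surfaces as -EREMOTEIO;
 * the "expected" values in the subcases below encode which of those
 * each request should provoke.)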
*/ memset(&req, 0, sizeof(req)); req.bRequest = USB_REQ_GET_DESCRIPTOR; req.bRequestType = USB_DIR_IN|USB_RECIP_DEVICE; switch (i % NUM_SUBCASES) { case 0: /* get device descriptor */ req.wValue = cpu_to_le16(USB_DT_DEVICE << 8); len = sizeof(struct usb_device_descriptor); break; case 1: /* get first config descriptor (only) */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = sizeof(struct usb_config_descriptor); break; case 2: /* get altsetting (OFTEN STALLS) */ req.bRequest = USB_REQ_GET_INTERFACE; req.bRequestType = USB_DIR_IN|USB_RECIP_INTERFACE; /* index = 0 means first interface */ len = 1; expected = EPIPE; break; case 3: /* get interface status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_INTERFACE; /* interface 0 */ len = 2; break; case 4: /* get device status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_DEVICE; len = 2; break; case 5: /* get device qualifier (MAY STALL) */ req.wValue = cpu_to_le16 (USB_DT_DEVICE_QUALIFIER << 8); len = sizeof(struct usb_qualifier_descriptor); if (udev->speed != USB_SPEED_HIGH) expected = EPIPE; break; case 6: /* get first config descriptor, plus interface */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = sizeof(struct usb_config_descriptor); len += sizeof(struct usb_interface_descriptor); break; case 7: /* get interface descriptor (ALWAYS STALLS) */ req.wValue = cpu_to_le16 (USB_DT_INTERFACE << 8); /* interface == 0 */ len = sizeof(struct usb_interface_descriptor); expected = -EPIPE; break; /* NOTE: two consecutive stalls in the queue here. * that tests fault recovery a bit more aggressively. */ case 8: /* clear endpoint halt (MAY STALL) */ req.bRequest = USB_REQ_CLEAR_FEATURE; req.bRequestType = USB_RECIP_ENDPOINT; /* wValue 0 == ep halt */ /* wIndex 0 == ep0 (shouldn't halt!) */ len = 0; pipe = usb_sndctrlpipe(udev, 0); expected = EPIPE; break; case 9: /* get endpoint status */ req.bRequest = USB_REQ_GET_STATUS; req.bRequestType = USB_DIR_IN|USB_RECIP_ENDPOINT; /* endpoint 0 */ len = 2; break; case 10: /* trigger short read (EREMOTEIO) */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); len = 1024; expected = -EREMOTEIO; break; /* NOTE: two consecutive _different_ faults in the queue. */ case 11: /* get endpoint descriptor (ALWAYS STALLS) */ req.wValue = cpu_to_le16(USB_DT_ENDPOINT << 8); /* endpoint == 0 */ len = sizeof(struct usb_interface_descriptor); expected = EPIPE; break; /* NOTE: sometimes even a third fault in the queue! 
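 * (string descriptor zero is optional, so case 12 may legitimately
 * answer with a stall instead of a short read; that's why
 * ctrl_complete() above special-cases subcase 12 finishing
 * with -EPIPE.)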
*/ case 12: /* get string 0 descriptor (MAY STALL) */ req.wValue = cpu_to_le16(USB_DT_STRING << 8); /* string == 0, for language IDs */ len = sizeof(struct usb_interface_descriptor); /* may succeed when > 4 languages */ expected = EREMOTEIO; /* or EPIPE, if no strings */ break; case 13: /* short read, resembling case 10 */ req.wValue = cpu_to_le16((USB_DT_CONFIG << 8) | 0); /* last data packet "should" be DATA1, not DATA0 */ if (udev->speed == USB_SPEED_SUPER) len = 1024 - 512; else len = 1024 - udev->descriptor.bMaxPacketSize0; expected = -EREMOTEIO; break; case 14: /* short read; try to fill the last packet */ req.wValue = cpu_to_le16((USB_DT_DEVICE << 8) | 0); /* device descriptor size == 18 bytes */ len = udev->descriptor.bMaxPacketSize0; if (udev->speed == USB_SPEED_SUPER) len = 512; switch (len) { case 8: len = 24; break; case 16: len = 32; break; } expected = -EREMOTEIO; break; case 15: req.wValue = cpu_to_le16(USB_DT_BOS << 8); if (udev->bos) len = le16_to_cpu(udev->bos->desc->wTotalLength); else len = sizeof(struct usb_bos_descriptor); if (le16_to_cpu(udev->descriptor.bcdUSB) < 0x0201) expected = -EPIPE; break; default: ERROR(dev, "bogus number of ctrl queue testcases!\n"); context.status = -EINVAL; goto cleanup; } req.wLength = cpu_to_le16(len); urb[i] = u = simple_alloc_urb(udev, pipe, len, 0); if (!u) goto cleanup; reqp = kmalloc(sizeof(*reqp), GFP_KERNEL); if (!reqp) goto cleanup; reqp->setup = req; reqp->number = i % NUM_SUBCASES; reqp->expected = expected; u->setup_packet = (char *) &reqp->setup; u->context = &context; u->complete = ctrl_complete; } /* queue the urbs */ context.urb = urb; spin_lock_irq(&context.lock); for (i = 0; i < param->sglen; i++) { context.status = usb_submit_urb(urb[i], GFP_ATOMIC); if (context.status != 0) { ERROR(dev, "can't submit urb[%d], status %d\n", i, context.status); context.count = context.pending; break; } context.pending++; } spin_unlock_irq(&context.lock); /* FIXME set timer and time out; provide a disconnect hook */ /* wait for the last one to complete */ if (context.pending > 0) wait_for_completion(&context.complete); cleanup: for (i = 0; i < param->sglen; i++) { if (!urb[i]) continue; urb[i]->dev = udev; kfree(urb[i]->setup_packet); simple_free_urb(urb[i]); } kfree(urb); return context.status; } #undef NUM_SUBCASES /*-------------------------------------------------------------------------*/ static void unlink1_callback(struct urb *urb) { int status = urb->status; /* we "know" -EPIPE (stall) never happens */ if (!status) status = usb_submit_urb(urb, GFP_ATOMIC); if (status) { urb->status = status; complete(urb->context); } } static int unlink1(struct usbtest_dev *dev, int pipe, int size, int async) { struct urb *urb; struct completion completion; int retval = 0; init_completion(&completion); urb = simple_alloc_urb(testdev_to_usbdev(dev), pipe, size, 0); if (!urb) return -ENOMEM; urb->context = &completion; urb->complete = unlink1_callback; if (usb_pipeout(urb->pipe)) { simple_fill_buf(urb); urb->transfer_flags |= URB_ZERO_PACKET; } /* keep the endpoint busy. there are lots of hc/hcd-internal * states, and testing should get to all of them over time. * * FIXME want additional tests for when endpoint is STALLing * due to errors, or is just NAKing requests. */ retval = usb_submit_urb(urb, GFP_KERNEL); if (retval != 0) { dev_err(&dev->intf->dev, "submit fail %d\n", retval); return retval; } /* unlinking that should always work. variable delay tests more * hcd states and code paths, even with little other system load. 
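 *
 * (worked example: "jiffies % (2 * INTERRUPT_RATE)" yields a cheap
 * pseudo-random delay of 0..(2 * INTERRUPT_RATE - 1) msec -- with
 * INTERRUPT_RATE presumably 1 msec/transfer, that's 0 or 1 msec --
 * so repeated runs unlink at different points in a transfer's life.)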
*/ msleep(jiffies % (2 * INTERRUPT_RATE)); if (async) { while (!completion_done(&completion)) { retval = usb_unlink_urb(urb); if (retval == 0 && usb_pipein(urb->pipe)) retval = simple_check_buf(dev, urb); switch (retval) { case -EBUSY: case -EIDRM: /* we can't unlink urbs while they're completing * or if they've completed, and we haven't * resubmitted. "normal" drivers would prevent * resubmission, but since we're testing unlink * paths, we can't. */ ERROR(dev, "unlink retry\n"); continue; case 0: case -EINPROGRESS: break; default: dev_err(&dev->intf->dev, "unlink fail %d\n", retval); return retval; } break; } } else usb_kill_urb(urb); wait_for_completion(&completion); retval = urb->status; simple_free_urb(urb); if (async) return (retval == -ECONNRESET) ? 0 : retval - 1000; else return (retval == -ENOENT || retval == -EPERM) ? 0 : retval - 2000; } static int unlink_simple(struct usbtest_dev *dev, int pipe, int len) { int retval = 0; /* test sync and async paths */ retval = unlink1(dev, pipe, len, 1); if (!retval) retval = unlink1(dev, pipe, len, 0); return retval; } /*-------------------------------------------------------------------------*/ struct queued_ctx { struct completion complete; atomic_t pending; unsigned num; int status; struct urb **urbs; }; static void unlink_queued_callback(struct urb *urb) { int status = urb->status; struct queued_ctx *ctx = urb->context; if (ctx->status) goto done; if (urb == ctx->urbs[ctx->num - 4] || urb == ctx->urbs[ctx->num - 2]) { if (status == -ECONNRESET) goto done; /* What error should we report if the URB completed normally? */ } if (status != 0) ctx->status = status; done: if (atomic_dec_and_test(&ctx->pending)) complete(&ctx->complete); } static int unlink_queued(struct usbtest_dev *dev, int pipe, unsigned num, unsigned size) { struct queued_ctx ctx; struct usb_device *udev = testdev_to_usbdev(dev); void *buf; dma_addr_t buf_dma; int i; int retval = -ENOMEM; init_completion(&ctx.complete); atomic_set(&ctx.pending, 1); /* One more than the actual value */ ctx.num = num; ctx.status = 0; buf = usb_alloc_coherent(udev, size, GFP_KERNEL, &buf_dma); if (!buf) return retval; memset(buf, 0, size); /* Allocate and init the urbs we'll queue */ ctx.urbs = kcalloc(num, sizeof(struct urb *), GFP_KERNEL); if (!ctx.urbs) goto free_buf; for (i = 0; i < num; i++) { ctx.urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!ctx.urbs[i]) goto free_urbs; usb_fill_bulk_urb(ctx.urbs[i], udev, pipe, buf, size, unlink_queued_callback, &ctx); ctx.urbs[i]->transfer_dma = buf_dma; ctx.urbs[i]->transfer_flags = URB_NO_TRANSFER_DMA_MAP; if (usb_pipeout(ctx.urbs[i]->pipe)) { simple_fill_buf(ctx.urbs[i]); ctx.urbs[i]->transfer_flags |= URB_ZERO_PACKET; } } /* Submit all the URBs and then unlink URBs num - 4 and num - 2. 
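 *
 * (those two victims are the urbs unlink_queued_callback() above
 * expects to finish with -ECONNRESET; picking entries near, but not
 * at, the tail exercises unlinking from the middle of a live queue,
 * and is why the caller requires sglen >= 4.)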
*/ for (i = 0; i < num; i++) { atomic_inc(&ctx.pending); retval = usb_submit_urb(ctx.urbs[i], GFP_KERNEL); if (retval != 0) { dev_err(&dev->intf->dev, "submit urbs[%d] fail %d\n", i, retval); atomic_dec(&ctx.pending); ctx.status = retval; break; } } if (i == num) { usb_unlink_urb(ctx.urbs[num - 4]); usb_unlink_urb(ctx.urbs[num - 2]); } else { while (--i >= 0) usb_unlink_urb(ctx.urbs[i]); } if (atomic_dec_and_test(&ctx.pending)) /* The extra count */ complete(&ctx.complete); wait_for_completion(&ctx.complete); retval = ctx.status; free_urbs: for (i = 0; i < num; i++) usb_free_urb(ctx.urbs[i]); kfree(ctx.urbs); free_buf: usb_free_coherent(udev, size, buf, buf_dma); return retval; } /*-------------------------------------------------------------------------*/ static int verify_not_halted(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; u16 status; /* shouldn't look or act halted */ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status); if (retval < 0) { ERROR(tdev, "ep %02x couldn't get no-halt status, %d\n", ep, retval); return retval; } if (status != 0) { ERROR(tdev, "ep %02x bogus status: %04x != 0\n", ep, status); return -EINVAL; } retval = simple_io(tdev, urb, 1, 0, 0, __func__); if (retval != 0) return -EINVAL; return 0; } static int verify_halted(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; u16 status; /* should look and act halted */ retval = usb_get_std_status(urb->dev, USB_RECIP_ENDPOINT, ep, &status); if (retval < 0) { ERROR(tdev, "ep %02x couldn't get halt status, %d\n", ep, retval); return retval; } if (status != 1) { ERROR(tdev, "ep %02x bogus status: %04x != 1\n", ep, status); return -EINVAL; } retval = simple_io(tdev, urb, 1, 0, -EPIPE, __func__); if (retval != -EPIPE) return -EINVAL; retval = simple_io(tdev, urb, 1, 0, -EPIPE, "verify_still_halted"); if (retval != -EPIPE) return -EINVAL; return 0; } static int test_halt(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; /* shouldn't look or act halted now */ retval = verify_not_halted(tdev, ep, urb); if (retval < 0) return retval; /* set halt (protocol test only), verify it worked */ retval = usb_control_msg(urb->dev, usb_sndctrlpipe(urb->dev, 0), USB_REQ_SET_FEATURE, USB_RECIP_ENDPOINT, USB_ENDPOINT_HALT, ep, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval < 0) { ERROR(tdev, "ep %02x couldn't set halt, %d\n", ep, retval); return retval; } retval = verify_halted(tdev, ep, urb); if (retval < 0) { int ret; /* clear halt anyways, else further tests will fail */ ret = usb_clear_halt(urb->dev, urb->pipe); if (ret) ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, ret); return retval; } /* clear halt (tests API + protocol), verify it worked */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } retval = verify_not_halted(tdev, ep, urb); if (retval < 0) return retval; /* NOTE: could also verify SET_INTERFACE clear halts ... 
*/ return 0; } static int test_toggle_sync(struct usbtest_dev *tdev, int ep, struct urb *urb) { int retval; /* clear initial data toggle to DATA0 */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } /* transfer 3 data packets, should be DATA0, DATA1, DATA0 */ retval = simple_io(tdev, urb, 1, 0, 0, __func__); if (retval != 0) return -EINVAL; /* clear halt resets device side data toggle, host should react to it */ retval = usb_clear_halt(urb->dev, urb->pipe); if (retval < 0) { ERROR(tdev, "ep %02x couldn't clear halt, %d\n", ep, retval); return retval; } /* host should use DATA0 again after clear halt */ retval = simple_io(tdev, urb, 1, 0, 0, __func__); return retval; } static int halt_simple(struct usbtest_dev *dev) { int ep; int retval = 0; struct urb *urb; struct usb_device *udev = testdev_to_usbdev(dev); if (udev->speed == USB_SPEED_SUPER) urb = simple_alloc_urb(udev, 0, 1024, 0); else urb = simple_alloc_urb(udev, 0, 512, 0); if (urb == NULL) return -ENOMEM; if (dev->in_pipe) { ep = usb_pipeendpoint(dev->in_pipe) | USB_DIR_IN; urb->pipe = dev->in_pipe; retval = test_halt(dev, ep, urb); if (retval < 0) goto done; } if (dev->out_pipe) { ep = usb_pipeendpoint(dev->out_pipe); urb->pipe = dev->out_pipe; retval = test_halt(dev, ep, urb); } done: simple_free_urb(urb); return retval; } static int toggle_sync_simple(struct usbtest_dev *dev) { int ep; int retval = 0; struct urb *urb; struct usb_device *udev = testdev_to_usbdev(dev); unsigned maxp = get_maxpacket(udev, dev->out_pipe); /* * Create a URB that causes a transfer of uneven amount of data packets * This way the clear toggle has an impact on the data toggle sequence. * Use 2 maxpacket length packets and one zero packet. */ urb = simple_alloc_urb(udev, 0, 2 * maxp, 0); if (urb == NULL) return -ENOMEM; urb->transfer_flags |= URB_ZERO_PACKET; ep = usb_pipeendpoint(dev->out_pipe); urb->pipe = dev->out_pipe; retval = test_toggle_sync(dev, ep, urb); simple_free_urb(urb); return retval; } /*-------------------------------------------------------------------------*/ /* Control OUT tests use the vendor control requests from Intel's * USB 2.0 compliance test device: write a buffer, read it back. * * Intel's spec only _requires_ that it work for one packet, which * is pretty weak. Some HCDs place limits here; most devices will * need to be able to handle more than one OUT data packet. We'll * try whatever we're told to try. */ static int ctrl_out(struct usbtest_dev *dev, unsigned count, unsigned length, unsigned vary, unsigned offset) { unsigned i, j, len; int retval; u8 *buf; char *what = "?"; struct usb_device *udev; if (length < 1 || length > 0xffff || vary >= length) return -EINVAL; buf = kmalloc(length + offset, GFP_KERNEL); if (!buf) return -ENOMEM; buf += offset; udev = testdev_to_usbdev(dev); len = length; retval = 0; /* NOTE: hardware might well act differently if we pushed it * with lots back-to-back queued requests. */ for (i = 0; i < count; i++) { /* write patterned data */ for (j = 0; j < len; j++) buf[j] = (u8)(i + j); retval = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x5b, USB_DIR_OUT|USB_TYPE_VENDOR, 0, 0, buf, len, USB_CTRL_SET_TIMEOUT); if (retval != len) { what = "write"; if (retval >= 0) { ERROR(dev, "ctrl_out, wlen %d (expected %d)\n", retval, len); retval = -EBADMSG; } break; } /* read it back -- assuming nothing intervened!! 
*/ retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x5c, USB_DIR_IN|USB_TYPE_VENDOR, 0, 0, buf, len, USB_CTRL_GET_TIMEOUT); if (retval != len) { what = "read"; if (retval >= 0) { ERROR(dev, "ctrl_out, rlen %d (expected %d)\n", retval, len); retval = -EBADMSG; } break; } /* fail if we can't verify */ for (j = 0; j < len; j++) { if (buf[j] != (u8)(i + j)) { ERROR(dev, "ctrl_out, byte %d is %d not %d\n", j, buf[j], (u8)(i + j)); retval = -EBADMSG; break; } } if (retval < 0) { what = "verify"; break; } len += vary; /* [real world] the "zero bytes IN" case isn't really used. * hardware can easily trip up in this weird case, since its * status stage is IN, not OUT like other ep0in transfers. */ if (len > length) len = realworld ? 1 : 0; } if (retval < 0) ERROR(dev, "ctrl_out %s failed, code %d, count %d\n", what, retval, i); kfree(buf - offset); return retval; } /*-------------------------------------------------------------------------*/ /* ISO/BULK tests ... mimics common usage * - buffer length is split into N packets (mostly maxpacket sized) * - multi-buffers according to sglen */ struct transfer_context { unsigned count; unsigned pending; spinlock_t lock; struct completion done; int submit_error; unsigned long errors; unsigned long packet_count; struct usbtest_dev *dev; bool is_iso; }; static void complicated_callback(struct urb *urb) { struct transfer_context *ctx = urb->context; unsigned long flags; spin_lock_irqsave(&ctx->lock, flags); ctx->count--; ctx->packet_count += urb->number_of_packets; if (urb->error_count > 0) ctx->errors += urb->error_count; else if (urb->status != 0) ctx->errors += (ctx->is_iso ? urb->number_of_packets : 1); else if (urb->actual_length != urb->transfer_buffer_length) ctx->errors++; else if (check_guard_bytes(ctx->dev, urb) != 0) ctx->errors++; if (urb->status == 0 && ctx->count > (ctx->pending - 1) && !ctx->submit_error) { int status = usb_submit_urb(urb, GFP_ATOMIC); switch (status) { case 0: goto done; default: dev_err(&ctx->dev->intf->dev, "resubmit err %d\n", status); fallthrough; case -ENODEV: /* disconnected */ case -ESHUTDOWN: /* endpoint disabled */ ctx->submit_error = 1; break; } } ctx->pending--; if (ctx->pending == 0) { if (ctx->errors) dev_err(&ctx->dev->intf->dev, "during the test, %lu errors out of %lu\n", ctx->errors, ctx->packet_count); complete(&ctx->done); } done: spin_unlock_irqrestore(&ctx->lock, flags); } static struct urb *iso_alloc_urb( struct usb_device *udev, int pipe, struct usb_endpoint_descriptor *desc, long bytes, unsigned offset ) { struct urb *urb; unsigned i, maxp, packets; if (bytes < 0 || !desc) return NULL; maxp = usb_endpoint_maxp(desc); if (udev->speed >= USB_SPEED_SUPER) maxp *= ss_isoc_get_packet_num(udev, pipe); else maxp *= usb_endpoint_maxp_mult(desc); packets = DIV_ROUND_UP(bytes, maxp); urb = usb_alloc_urb(packets, GFP_KERNEL); if (!urb) return urb; urb->dev = udev; urb->pipe = pipe; urb->number_of_packets = packets; urb->transfer_buffer_length = bytes; urb->transfer_buffer = usb_alloc_coherent(udev, bytes + offset, GFP_KERNEL, &urb->transfer_dma); if (!urb->transfer_buffer) { usb_free_urb(urb); return NULL; } if (offset) { memset(urb->transfer_buffer, GUARD_BYTE, offset); urb->transfer_buffer += offset; urb->transfer_dma += offset; } /* For inbound transfers use guard byte so that test fails if data not correctly copied */ memset(urb->transfer_buffer, usb_pipein(urb->pipe) ? 
GUARD_BYTE : 0, bytes); for (i = 0; i < packets; i++) { /* here, only the last packet will be short */ urb->iso_frame_desc[i].length = min((unsigned) bytes, maxp); bytes -= urb->iso_frame_desc[i].length; urb->iso_frame_desc[i].offset = maxp * i; } urb->complete = complicated_callback; /* urb->context = SET BY CALLER */ urb->interval = 1 << (desc->bInterval - 1); urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; return urb; } static int test_queue(struct usbtest_dev *dev, struct usbtest_param_32 *param, int pipe, struct usb_endpoint_descriptor *desc, unsigned offset) { struct transfer_context context; struct usb_device *udev; unsigned i; unsigned long packets = 0; int status = 0; struct urb **urbs; if (!param->sglen || param->iterations > UINT_MAX / param->sglen) return -EINVAL; if (param->sglen > MAX_SGLEN) return -EINVAL; urbs = kcalloc(param->sglen, sizeof(*urbs), GFP_KERNEL); if (!urbs) return -ENOMEM; memset(&context, 0, sizeof(context)); context.count = param->iterations * param->sglen; context.dev = dev; context.is_iso = !!desc; init_completion(&context.done); spin_lock_init(&context.lock); udev = testdev_to_usbdev(dev); for (i = 0; i < param->sglen; i++) { if (context.is_iso) urbs[i] = iso_alloc_urb(udev, pipe, desc, param->length, offset); else urbs[i] = complicated_alloc_urb(udev, pipe, param->length, 0); if (!urbs[i]) { status = -ENOMEM; goto fail; } packets += urbs[i]->number_of_packets; urbs[i]->context = &context; } packets *= param->iterations; if (context.is_iso) { int transaction_num; if (udev->speed >= USB_SPEED_SUPER) transaction_num = ss_isoc_get_packet_num(udev, pipe); else transaction_num = usb_endpoint_maxp_mult(desc); dev_info(&dev->intf->dev, "iso period %d %sframes, wMaxPacket %d, transactions: %d\n", 1 << (desc->bInterval - 1), (udev->speed >= USB_SPEED_HIGH) ? "micro" : "", usb_endpoint_maxp(desc), transaction_num); dev_info(&dev->intf->dev, "total %lu msec (%lu packets)\n", (packets * (1 << (desc->bInterval - 1))) / ((udev->speed >= USB_SPEED_HIGH) ? 8 : 1), packets); } spin_lock_irq(&context.lock); for (i = 0; i < param->sglen; i++) { ++context.pending; status = usb_submit_urb(urbs[i], GFP_ATOMIC); if (status < 0) { ERROR(dev, "submit iso[%d], error %d\n", i, status); if (i == 0) { spin_unlock_irq(&context.lock); goto fail; } simple_free_urb(urbs[i]); urbs[i] = NULL; context.pending--; context.submit_error = 1; break; } } spin_unlock_irq(&context.lock); wait_for_completion(&context.done); for (i = 0; i < param->sglen; i++) { if (urbs[i]) simple_free_urb(urbs[i]); } /* * Isochronous transfers are expected to fail sometimes. As an * arbitrary limit, we will report an error if any submissions * fail or if the transfer failure rate is > 10%. */ if (status != 0) ; else if (context.submit_error) status = -EACCES; else if (context.errors > (context.is_iso ? context.packet_count / 10 : 0)) status = -EIO; kfree(urbs); return status; fail: for (i = 0; i < param->sglen; i++) { if (urbs[i]) simple_free_urb(urbs[i]); } kfree(urbs); return status; } static int test_unaligned_bulk( struct usbtest_dev *tdev, int pipe, unsigned length, int iterations, unsigned transfer_flags, const char *label) { int retval; struct urb *urb = usbtest_alloc_urb(testdev_to_usbdev(tdev), pipe, length, transfer_flags, 1, 0, simple_callback); if (!urb) return -ENOMEM; retval = simple_io(tdev, urb, iterations, 0, 0, label); simple_free_urb(urb); return retval; } /* Run tests. 
*/ static int usbtest_do_ioctl(struct usb_interface *intf, struct usbtest_param_32 *param) { struct usbtest_dev *dev = usb_get_intfdata(intf); struct usb_device *udev = testdev_to_usbdev(dev); struct urb *urb; struct scatterlist *sg; struct usb_sg_request req; unsigned i; int retval = -EOPNOTSUPP; if (param->iterations <= 0) return -EINVAL; if (param->sglen > MAX_SGLEN) return -EINVAL; /* * Just a bunch of test cases that every HCD is expected to handle. * * Some may need specific firmware, though it'd be good to have * one firmware image to handle all the test cases. * * FIXME add more tests! cancel requests, verify the data, control * queueing, concurrent read+write threads, and so on. */ switch (param->test_num) { case 0: dev_info(&intf->dev, "TEST 0: NOP\n"); retval = 0; break; /* Simple non-queued bulk I/O tests */ case 1: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 1: write %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test1"); simple_free_urb(urb); break; case 2: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 2: read %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test2"); simple_free_urb(urb); break; case 3: if (dev->out_pipe == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 3: write/%d 0..%d bytes %u times\n", param->vary, param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, param->vary, 0, "test3"); simple_free_urb(urb); break; case 4: if (dev->in_pipe == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 4: read/%d 0..%d bytes %u times\n", param->vary, param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_pipe, param->length, 0); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, param->vary, 0, "test4"); simple_free_urb(urb); break; /* Queued bulk I/O tests */ case 5: if (dev->out_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 5: write %d sglists %d entries of %d bytes\n", param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, 0, dev, dev->out_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = perform_sglist(dev, param->iterations, dev->out_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 6: if (dev->in_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 6: read %d sglists %d entries of %d bytes\n", param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, 0, dev, dev->in_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = perform_sglist(dev, param->iterations, dev->in_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 7: if (dev->out_pipe == 0 || param->sglen == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 7: write/%d 
%d sglists %d entries 0..%d bytes\n", param->vary, param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, param->vary, dev, dev->out_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk sink (maybe accepts short writes) */ retval = perform_sglist(dev, param->iterations, dev->out_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; case 8: if (dev->in_pipe == 0 || param->sglen == 0 || param->vary == 0) break; dev_info(&intf->dev, "TEST 8: read/%d %d sglists %d entries 0..%d bytes\n", param->vary, param->iterations, param->sglen, param->length); sg = alloc_sglist(param->sglen, param->length, param->vary, dev, dev->in_pipe); if (!sg) { retval = -ENOMEM; break; } /* FIRMWARE: bulk source (maybe generates short writes) */ retval = perform_sglist(dev, param->iterations, dev->in_pipe, &req, sg, param->sglen); free_sglist(sg, param->sglen); break; /* non-queued sanity tests for control (chapter 9 subset) */ case 9: retval = 0; dev_info(&intf->dev, "TEST 9: ch9 (subset) control tests, %d times\n", param->iterations); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = ch9_postconfig(dev); if (retval) dev_err(&intf->dev, "ch9 subset failed, " "iterations left %d\n", i); break; /* queued control messaging */ case 10: retval = 0; dev_info(&intf->dev, "TEST 10: queue %d control calls, %d times\n", param->sglen, param->iterations); retval = test_ctrl_queue(dev, param); break; /* simple non-queued unlinks (ring with one urb) */ case 11: if (dev->in_pipe == 0 || !param->length) break; retval = 0; dev_info(&intf->dev, "TEST 11: unlink %d reads of %d\n", param->iterations, param->length); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = unlink_simple(dev, dev->in_pipe, param->length); if (retval) dev_err(&intf->dev, "unlink reads failed %d, " "iterations left %d\n", retval, i); break; case 12: if (dev->out_pipe == 0 || !param->length) break; retval = 0; dev_info(&intf->dev, "TEST 12: unlink %d writes of %d\n", param->iterations, param->length); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = unlink_simple(dev, dev->out_pipe, param->length); if (retval) dev_err(&intf->dev, "unlink writes failed %d, " "iterations left %d\n", retval, i); break; /* ep halt tests */ case 13: if (dev->out_pipe == 0 && dev->in_pipe == 0) break; retval = 0; dev_info(&intf->dev, "TEST 13: set/clear %d halts\n", param->iterations); for (i = param->iterations; retval == 0 && i--; /* NOP */) retval = halt_simple(dev); if (retval) ERROR(dev, "halts failed, iterations left %d\n", i); break; /* control write tests */ case 14: if (!dev->info->ctrl_out) break; dev_info(&intf->dev, "TEST 14: %d ep0out, %d..%d vary %d\n", param->iterations, realworld ? 
1 : 0, param->length, param->vary); retval = ctrl_out(dev, param->iterations, param->length, param->vary, 0); break; /* iso write tests */ case 15: if (dev->out_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 15: write %d iso, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); /* FIRMWARE: iso sink */ retval = test_queue(dev, param, dev->out_iso_pipe, dev->iso_out, 0); break; /* iso read tests */ case 16: if (dev->in_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 16: read %d iso, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); /* FIRMWARE: iso source */ retval = test_queue(dev, param, dev->in_iso_pipe, dev->iso_in, 0); break; /* FIXME scatterlist cancel (needs helper thread) */ /* Tests for bulk I/O using DMA mapping by core and odd address */ case 17: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 17: write odd addr %d bytes %u times core map\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->out_pipe, param->length, param->iterations, 0, "test17"); break; case 18: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 18: read odd addr %d bytes %u times core map\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->in_pipe, param->length, param->iterations, 0, "test18"); break; /* Tests for bulk I/O using premapped coherent buffer and odd address */ case 19: if (dev->out_pipe == 0) break; dev_info(&intf->dev, "TEST 19: write odd addr %d bytes %u times premapped\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->out_pipe, param->length, param->iterations, URB_NO_TRANSFER_DMA_MAP, "test19"); break; case 20: if (dev->in_pipe == 0) break; dev_info(&intf->dev, "TEST 20: read odd addr %d bytes %u times premapped\n", param->length, param->iterations); retval = test_unaligned_bulk( dev, dev->in_pipe, param->length, param->iterations, URB_NO_TRANSFER_DMA_MAP, "test20"); break; /* control write tests with unaligned buffer */ case 21: if (!dev->info->ctrl_out) break; dev_info(&intf->dev, "TEST 21: %d ep0out odd addr, %d..%d vary %d\n", param->iterations, realworld ? 
1 : 0, param->length, param->vary); retval = ctrl_out(dev, param->iterations, param->length, param->vary, 1); break; /* unaligned iso tests */ case 22: if (dev->out_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 22: write %d iso odd, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); retval = test_queue(dev, param, dev->out_iso_pipe, dev->iso_out, 1); break; case 23: if (dev->in_iso_pipe == 0 || param->sglen == 0) break; dev_info(&intf->dev, "TEST 23: read %d iso odd, %d entries of %d bytes\n", param->iterations, param->sglen, param->length); retval = test_queue(dev, param, dev->in_iso_pipe, dev->iso_in, 1); break; /* unlink URBs from a bulk-OUT queue */ case 24: if (dev->out_pipe == 0 || !param->length || param->sglen < 4) break; retval = 0; dev_info(&intf->dev, "TEST 24: unlink from %d queues of " "%d %d-byte writes\n", param->iterations, param->sglen, param->length); for (i = param->iterations; retval == 0 && i > 0; --i) { retval = unlink_queued(dev, dev->out_pipe, param->sglen, param->length); if (retval) { dev_err(&intf->dev, "unlink queued writes failed %d, " "iterations left %d\n", retval, i); break; } } break; /* Simple non-queued interrupt I/O tests */ case 25: if (dev->out_int_pipe == 0) break; dev_info(&intf->dev, "TEST 25: write %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->out_int_pipe, param->length, dev->int_out->bInterval); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: interrupt sink (maybe accepts short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test25"); simple_free_urb(urb); break; case 26: if (dev->in_int_pipe == 0) break; dev_info(&intf->dev, "TEST 26: read %d bytes %u times\n", param->length, param->iterations); urb = simple_alloc_urb(udev, dev->in_int_pipe, param->length, dev->int_in->bInterval); if (!urb) { retval = -ENOMEM; break; } /* FIRMWARE: interrupt source (maybe generates short writes) */ retval = simple_io(dev, urb, param->iterations, 0, 0, "test26"); simple_free_urb(urb); break; case 27: /* We do performance test, so ignore data compare */ if (dev->out_pipe == 0 || param->sglen == 0 || pattern != 0) break; dev_info(&intf->dev, "TEST 27: bulk write %dMbytes\n", (param->iterations * param->sglen * param->length) / (1024 * 1024)); retval = test_queue(dev, param, dev->out_pipe, NULL, 0); break; case 28: if (dev->in_pipe == 0 || param->sglen == 0 || pattern != 0) break; dev_info(&intf->dev, "TEST 28: bulk read %dMbytes\n", (param->iterations * param->sglen * param->length) / (1024 * 1024)); retval = test_queue(dev, param, dev->in_pipe, NULL, 0); break; /* Test data Toggle/seq_nr clear between bulk out transfers */ case 29: if (dev->out_pipe == 0) break; retval = 0; dev_info(&intf->dev, "TEST 29: Clear toggle between bulk writes %d times\n", param->iterations); for (i = param->iterations; retval == 0 && i > 0; --i) retval = toggle_sync_simple(dev); if (retval) ERROR(dev, "toggle sync failed, iterations left %d\n", i); break; } return retval; } /*-------------------------------------------------------------------------*/ /* We only have this one interface to user space, through usbfs. * User mode code can scan usbfs to find N different devices (maybe on * different busses) to use when testing, and allocate one thread per * test. So discovery is simplified, and we have no device naming issues. * * Don't use these only as stress/load tests. 
Use them along with * other USB bus activity: plugging, unplugging, mousing, mp3 playback, * video capture, and so on. Run different tests at different times, in * different sequences. Nothing here should interact with other devices, * except indirectly by consuming USB bandwidth and CPU resources for test * threads and request completion. But the only way to know that for sure * is to test when HC queues are in use by many devices. * * WARNING: Because usbfs grabs udev->dev.sem before calling this ioctl(), * it locks out usbcore in certain code paths. Notably, if you disconnect * the device-under-test, hub_wq will wait block forever waiting for the * ioctl to complete ... so that usb_disconnect() can abort the pending * urbs and then call usbtest_disconnect(). To abort a test, you're best * off just killing the userspace task and waiting for it to exit. */ static int usbtest_ioctl(struct usb_interface *intf, unsigned int code, void *buf) { struct usbtest_dev *dev = usb_get_intfdata(intf); struct usbtest_param_64 *param_64 = buf; struct usbtest_param_32 temp; struct usbtest_param_32 *param_32 = buf; struct timespec64 start; struct timespec64 end; struct timespec64 duration; int retval = -EOPNOTSUPP; /* FIXME USBDEVFS_CONNECTINFO doesn't say how fast the device is. */ pattern = mod_pattern; if (mutex_lock_interruptible(&dev->lock)) return -ERESTARTSYS; /* FIXME: What if a system sleep starts while a test is running? */ /* some devices, like ez-usb default devices, need a non-default * altsetting to have any active endpoints. some tests change * altsettings; force a default so most tests don't need to check. */ if (dev->info->alt >= 0) { if (intf->altsetting->desc.bInterfaceNumber) { retval = -ENODEV; goto free_mutex; } retval = set_altsetting(dev, dev->info->alt); if (retval) { dev_err(&intf->dev, "set altsetting to %d failed, %d\n", dev->info->alt, retval); goto free_mutex; } } switch (code) { case USBTEST_REQUEST_64: temp.test_num = param_64->test_num; temp.iterations = param_64->iterations; temp.length = param_64->length; temp.sglen = param_64->sglen; temp.vary = param_64->vary; param_32 = &temp; break; case USBTEST_REQUEST_32: break; default: retval = -EOPNOTSUPP; goto free_mutex; } ktime_get_ts64(&start); retval = usbtest_do_ioctl(intf, param_32); if (retval < 0) goto free_mutex; ktime_get_ts64(&end); duration = timespec64_sub(end, start); temp.duration_sec = duration.tv_sec; temp.duration_usec = duration.tv_nsec/NSEC_PER_USEC; switch (code) { case USBTEST_REQUEST_32: param_32->duration_sec = temp.duration_sec; param_32->duration_usec = temp.duration_usec; break; case USBTEST_REQUEST_64: param_64->duration_sec = temp.duration_sec; param_64->duration_usec = temp.duration_usec; break; } free_mutex: mutex_unlock(&dev->lock); return retval; } /*-------------------------------------------------------------------------*/ static unsigned force_interrupt; module_param(force_interrupt, uint, 0); MODULE_PARM_DESC(force_interrupt, "0 = test default; else interrupt"); #ifdef GENERIC static unsigned short vendor; module_param(vendor, ushort, 0); MODULE_PARM_DESC(vendor, "vendor code (from usb-if)"); static unsigned short product; module_param(product, ushort, 0); MODULE_PARM_DESC(product, "product code (from vendor)"); #endif static int usbtest_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev; struct usbtest_dev *dev; struct usbtest_info *info; char *rtest, *wtest; char *irtest, *iwtest; char *intrtest, *intwtest; udev = interface_to_usbdev(intf); #ifdef 
GENERIC /* specify devices by module parameters? */ if (id->match_flags == 0) { /* vendor match required, product match optional */ if (!vendor || le16_to_cpu(udev->descriptor.idVendor) != (u16)vendor) return -ENODEV; if (product && le16_to_cpu(udev->descriptor.idProduct) != (u16)product) return -ENODEV; dev_info(&intf->dev, "matched module params, " "vend=0x%04x prod=0x%04x\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); } #endif dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; info = (struct usbtest_info *) id->driver_info; dev->info = info; mutex_init(&dev->lock); dev->intf = intf; /* cacheline-aligned scratch for i/o */ dev->buf = kmalloc(TBUF_SIZE, GFP_KERNEL); if (dev->buf == NULL) { kfree(dev); return -ENOMEM; } /* NOTE this doesn't yet test the handful of difference that are * visible with high speed interrupts: bigger maxpacket (1K) and * "high bandwidth" modes (up to 3 packets/uframe). */ rtest = wtest = ""; irtest = iwtest = ""; intrtest = intwtest = ""; if (force_interrupt || udev->speed == USB_SPEED_LOW) { if (info->ep_in) { dev->in_pipe = usb_rcvintpipe(udev, info->ep_in); rtest = " intr-in"; } if (info->ep_out) { dev->out_pipe = usb_sndintpipe(udev, info->ep_out); wtest = " intr-out"; } } else { if (override_alt >= 0 || info->autoconf) { int status; status = get_endpoints(dev, intf); if (status < 0) { WARNING(dev, "couldn't get endpoints, %d\n", status); kfree(dev->buf); kfree(dev); return status; } /* may find bulk or ISO pipes */ } else { if (info->ep_in) dev->in_pipe = usb_rcvbulkpipe(udev, info->ep_in); if (info->ep_out) dev->out_pipe = usb_sndbulkpipe(udev, info->ep_out); } if (dev->in_pipe) rtest = " bulk-in"; if (dev->out_pipe) wtest = " bulk-out"; if (dev->in_iso_pipe) irtest = " iso-in"; if (dev->out_iso_pipe) iwtest = " iso-out"; if (dev->in_int_pipe) intrtest = " int-in"; if (dev->out_int_pipe) intwtest = " int-out"; } usb_set_intfdata(intf, dev); dev_info(&intf->dev, "%s\n", info->name); dev_info(&intf->dev, "%s {control%s%s%s%s%s%s%s} tests%s\n", usb_speed_string(udev->speed), info->ctrl_out ? " in/out" : "", rtest, wtest, irtest, iwtest, intrtest, intwtest, info->alt >= 0 ? " (+alt)" : ""); return 0; } static int usbtest_suspend(struct usb_interface *intf, pm_message_t message) { return 0; } static int usbtest_resume(struct usb_interface *intf) { return 0; } static void usbtest_disconnect(struct usb_interface *intf) { struct usbtest_dev *dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); dev_dbg(&intf->dev, "disconnect\n"); kfree(dev->buf); kfree(dev); } /* Basic testing only needs a device that can source or sink bulk traffic. * Any device can test control transfers (default with GENERIC binding). * * Several entries work with the default EP0 implementation that's built * into EZ-USB chips. There's a default vendor ID which can be overridden * by (very) small config EEPROMS, but otherwise all these devices act * identically until firmware is loaded: only EP0 works. It turns out * to be easy to make other endpoints work, without modifying that EP0 * behavior. For now, we expect that kind of firmware. 
*/ /* an21xx or fx versions of ez-usb */ static struct usbtest_info ez1_info = { .name = "EZ-USB device", .ep_in = 2, .ep_out = 2, .alt = 1, }; /* fx2 version of ez-usb */ static struct usbtest_info ez2_info = { .name = "FX2 device", .ep_in = 6, .ep_out = 2, .alt = 1, }; /* ezusb family device with dedicated usb test firmware, */ static struct usbtest_info fw_info = { .name = "usb test device", .ep_in = 2, .ep_out = 2, .alt = 1, .autoconf = 1, /* iso and ctrl_out need autoconf */ .ctrl_out = 1, .iso = 1, /* iso_ep's are #8 in/out */ }; /* peripheral running Linux and 'zero.c' test firmware, or * its user-mode cousin. different versions of this use * different hardware with the same vendor/product codes. * host side MUST rely on the endpoint descriptors. */ static struct usbtest_info gz_info = { .name = "Linux gadget zero", .autoconf = 1, .ctrl_out = 1, .iso = 1, .intr = 1, .alt = 0, }; static struct usbtest_info um_info = { .name = "Linux user mode test driver", .autoconf = 1, .alt = -1, }; static struct usbtest_info um2_info = { .name = "Linux user mode ISO test driver", .autoconf = 1, .iso = 1, .alt = -1, }; #ifdef IBOT2 /* this is a nice source of high speed bulk data; * uses an FX2, with firmware provided in the device */ static struct usbtest_info ibot2_info = { .name = "iBOT2 webcam", .ep_in = 2, .alt = -1, }; #endif #ifdef GENERIC /* we can use any device to test control traffic */ static struct usbtest_info generic_info = { .name = "Generic USB device", .alt = -1, }; #endif static const struct usb_device_id id_table[] = { /*-------------------------------------------------------------*/ /* EZ-USB devices which download firmware to replace (or in our * case augment) the default device implementation. */ /* generic EZ-USB FX controller */ { USB_DEVICE(0x0547, 0x2235), .driver_info = (unsigned long) &ez1_info, }, /* CY3671 development board with EZ-USB FX */ { USB_DEVICE(0x0547, 0x0080), .driver_info = (unsigned long) &ez1_info, }, /* generic EZ-USB FX2 controller (or development board) */ { USB_DEVICE(0x04b4, 0x8613), .driver_info = (unsigned long) &ez2_info, }, /* re-enumerated usb test device firmware */ { USB_DEVICE(0xfff0, 0xfff0), .driver_info = (unsigned long) &fw_info, }, /* "Gadget Zero" firmware runs under Linux */ { USB_DEVICE(0x0525, 0xa4a0), .driver_info = (unsigned long) &gz_info, }, /* so does a user-mode variant */ { USB_DEVICE(0x0525, 0xa4a4), .driver_info = (unsigned long) &um_info, }, /* ... and a user-mode variant that talks iso */ { USB_DEVICE(0x0525, 0xa4a3), .driver_info = (unsigned long) &um2_info, }, #ifdef KEYSPAN_19Qi /* Keyspan 19qi uses an21xx (original EZ-USB) */ /* this does not coexist with the real Keyspan 19qi driver! */ { USB_DEVICE(0x06cd, 0x010b), .driver_info = (unsigned long) &ez1_info, }, #endif /*-------------------------------------------------------------*/ #ifdef IBOT2 /* iBOT2 makes a nice source of high speed bulk-in data */ /* this does not coexist with a real iBOT2 driver! 
*/ { USB_DEVICE(0x0b62, 0x0059), .driver_info = (unsigned long) &ibot2_info, }, #endif /*-------------------------------------------------------------*/ #ifdef GENERIC /* module params can specify devices to use for control tests */ { .driver_info = (unsigned long) &generic_info, }, #endif /*-------------------------------------------------------------*/ { } }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_driver usbtest_driver = { .name = "usbtest", .id_table = id_table, .probe = usbtest_probe, .unlocked_ioctl = usbtest_ioctl, .disconnect = usbtest_disconnect, .suspend = usbtest_suspend, .resume = usbtest_resume, }; /*-------------------------------------------------------------------------*/ static int __init usbtest_init(void) { #ifdef GENERIC if (vendor) pr_debug("params: vend=0x%04x prod=0x%04x\n", vendor, product); #endif return usb_register(&usbtest_driver); } module_init(usbtest_init); static void __exit usbtest_exit(void) { usb_deregister(&usbtest_driver); } module_exit(usbtest_exit); MODULE_DESCRIPTION("USB Core/HCD Testing Driver"); MODULE_LICENSE("GPL"); |
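/* Illustrative user-space sketch (not part of this driver): one way to
 * kick off a test through usbfs, in the spirit of tools/usb/testusb.c.
 * The parameter struct below is assumed to match the driver's
 * usbtest_param_32 layout, and the 'U'/100 ioctl code is assumed to be
 * USBTEST_REQUEST_32; verify both against the real sources before use.
 */
#if 0	/* build as a stand-alone program, not as kernel code */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/usbdevice_fs.h>

/* assumed to mirror the driver's struct usbtest_param_32 */
struct usbtest_param {
	unsigned	test_num;	/* 0 is NOP, 1 is bulk write, ... */
	unsigned	iterations;
	unsigned	length;
	unsigned	vary;
	unsigned	sglen;
	/* outputs */
	int		duration_sec;
	int		duration_usec;
};
#define USBTEST_REQUEST		_IOWR('U', 100, struct usbtest_param)

int main(int argc, char **argv)
{
	struct usbtest_param	param;
	struct usbdevfs_ioctl	wrapper;
	int			fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s /dev/bus/usb/BBB/DDD\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&param, 0, sizeof(param));
	param.test_num = 1;		/* TEST 1: simple bulk writes */
	param.iterations = 1000;
	param.length = 512;

	memset(&wrapper, 0, sizeof(wrapper));
	wrapper.ifno = 0;		/* interface claimed by usbtest */
	wrapper.ioctl_code = USBTEST_REQUEST;
	wrapper.data = &param;

	/* usbfs forwards this to usbtest_ioctl() for the bound interface */
	if (ioctl(fd, USBDEVFS_IOCTL, &wrapper) < 0) {
		perror("USBDEVFS_IOCTL");
		return 1;
	}
	printf("test %u took %d.%06d s\n", param.test_num,
			param.duration_sec, param.duration_usec);
	return 0;
}
#endif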
| 287 287 287 287 797 797 106 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/tracepoint-defs.h> struct folio_batch; /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Unlike WARN_ON_ONCE(), no warning will be issued * when __GFP_NOWARN is specified. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). 
Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. */ static inline int folio_nr_pages_mapped(struct folio *folio) { return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. 
One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } /* * Return true if a folio needs ->release_folio() calling upon it. */ static inline bool folio_needs_release(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); return folio_has_private(folio) || (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is considered the only way forward. */ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); int __meminit init_per_zone_wmark_min(void); void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. 
*/ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * This function checks whether a page is free && is the buddy of @page; * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy can be non-PageBuddy, out of @page's zone, or of an order * different from @page's, so it must be validated before use. * * Return: the found buddy page or NULL if not found. 
*/ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. */ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef CONFIG_64BIT folio->_folio_nr_pages = 1U << order; #endif } void folio_undo_large_rmappable(struct folio *folio); static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_order(folio) > 1) folio_prep_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); } static inline void prep_compound_tail(struct page *head, int tail_idx) { struct page *p = head + tail_idx; p->mapping = TAIL_MAPPING; set_compound_head(p, head); set_page_private(p, 0); } extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); int split_free_page(struct page *free_page, unsigned int order, unsigned long split_pfn_offset); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. 
Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. */ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool only_stealable, bool *can_steal); static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } /* * These three helpers classify VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. 
*/ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } /* mm/util.c */ struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the * range. * "fully mapped" means all the pages of the folio are associated with the page * table of the range, while this function just checks whether the folio range is * within the range [start, end). The caller needs to do a page table * check if it cares about the page table association. * * Typical usage (like mlock or madvise) is: * the caller knows at least 1 page of the folio is associated with the page * table of the VMA and the range [start, end) intersects the VMA range. The * caller wants to know whether the folio is fully associated with the range. It * calls this function to check whether the folio is in the range first, then * checks the page table to know whether the folio is fully mapped to the range. */ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock the folio whenever this function is called. 
Ideally, we should only * munlock if some page of the folio is unmapped from the VMA, * leaving the folio not fully mapped to the VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* * Return the start of user virtual address at the specific offset within * a vma. */ static inline unsigned long vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, struct vm_area_struct *vma) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Return the start of user virtual address of a page within a vma. * Returns -EFAULT if all of the page is outside the range of vma. * If page is a compound head, the entire compound page is considered. */ static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; /* Common case, plus ->pgoff is invalid for KSM */ if (pvmw->nr_pages == 1) return pvmw->address + PAGE_SIZE; pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. 
*/ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); release_fault_lock(vmf); } return fpin; } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* Memory initialisation debug and verification */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif /* * mm/memory-failure.c */ extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. 
*/ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags pagetype_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; static inline bool is_migrate_highatomic(enum migratetype migratetype) { return migratetype == MIGRATE_HIGHATOMIC; } static inline bool is_migrate_highatomic_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; } void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; }; /* * mm/filemap.c */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size); /* * mm/vmalloc.c */ #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); #else static inline void vmalloc_init(void) { } static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return -EINVAL; } #endif int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); void free_zone_device_page(struct page *page); int migrate_device_coherent_page(struct page *page); /* * mm/gup.c */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); int __must_check try_grab_page(struct page *page, unsigned int flags); /* * mm/huge_memory.c */ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); /* * mm/mmap.c */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 
<< 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. */ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pinning: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) smp_rmb(); /* * During GUP-fast we might not get called on the head page for a * hugetlb page that is mapped using cont-PTE, because GUP-fast does * not work with the abstracted hugetlb PTEs that always point at the * head page. For hugetlb, PageAnonExclusive only applies on the head * page (as it cannot be partially COW-shared), so lookup the head page. */ if (unlikely(!PageHead(page) && PageHuge(page))) page = compound_head(page); /* * Note that PageKsm() pages cannot be exclusive, and consequently, * cannot get pinned. */ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; extern bool memblock_has_mirror(void); static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when soft-dirty is not compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * VM_SOFTDIRTY vma flag is not set. 
*/ return !(vma->vm_flags & VM_SOFTDIRTY); } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; }; void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); return shrinker->name ? 
0 : -ENOMEM; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { kfree_const(shrinker->name); shrinker->name = NULL; } extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list ap) { return 0; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { } static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { *debugfs_id = -1; return NULL; } static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) { } #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* __MM_INTERNAL_H */
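Editor's note: a standalone userspace sketch (not kernel code) of the buddy arithmetic documented above for __find_buddy_pfn(): a buddy pair at order O differs only in bit O of the pfn, and clearing that bit yields the order O+1 parent.

#include <assert.h>
#include <stdio.h>

/* B2 = B1 ^ (1 << O): toggle bit O to find the order-O buddy */
static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);
}

/* P = B & ~(1 << O): clear bit O to find the combined order O+1 page */
static unsigned long parent_pfn(unsigned long pfn, unsigned int order)
{
	return pfn & ~(1UL << order);
}

int main(void)
{
	/* the worked example from the comment: pfn 8 at order 1 -> buddy 10 */
	assert(find_buddy_pfn(8, 1) == 10);
	assert(find_buddy_pfn(10, 1) == 8);		/* the relation is symmetric */
	assert(parent_pfn(8, 1) == parent_pfn(10, 1));	/* both merge into pfn 8 */

	for (unsigned int order = 0; order < 4; order++)
		printf("pfn 8, order %u -> buddy %lu, parent %lu\n",
		       order, find_buddy_pfn(8, order), parent_pfn(8, order));
	return 0;
}

The XOR trick is also why the comment requires *_mem_map to be contiguous up to MAX_PAGE_ORDER: the buddy is located purely by pfn arithmetic, and page_is_buddy() performs the actual validation afterwards.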
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth. 
*/ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. 
*/ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection management requires a unique gid. 
*/ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } /* * Network link becomes unavailable. Mark all * affected QP's accordingly. */ static void siw_netdev_down(struct work_struct *work) { struct siw_device *sdev = container_of(work, struct siw_device, netdev_down); struct siw_qp_attrs qp_attrs; struct list_head *pos, *tmp; memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.state = SIW_QP_STATE_ERROR; list_for_each_safe(pos, tmp, &sdev->qp_list) { struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); down_write(&qp->state_lock); WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); up_write(&qp->state_lock); } ib_device_put(&sdev->base_dev); } static void siw_device_goes_down(struct siw_device *sdev) { if (ib_device_try_get(&sdev->base_dev)) { INIT_WORK(&sdev->netdev_down, siw_netdev_down); schedule_work(&sdev->netdev_down); } } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_UP: sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_GOING_DOWN: siw_device_goes_down(sdev); break; case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. 
*/ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: case NETDEV_CHANGE: break; default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); if (netif_running(netdev) && netif_carrier_ok(netdev)) sdev->state = IB_PORT_ACTIVE; else sdev->state = IB_PORT_DOWN; rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } /* * Locate CRC32 algorithm. If unsuccessful, fail * loading siw only, if CRC is required. */ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(siw_crypto_shash)) { pr_info("siw: Loading CRC32c failed: %ld\n", PTR_ERR(siw_crypto_shash)); siw_crypto_shash = NULL; if (mpa_crc_required) { rv = -EOPNOTSUPP; goto out_error; } } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
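Editor's note: a userspace sketch (hypothetical data, not part of siw) of the selection policy implemented by siw_get_tx_cpu() above: scan the candidate CPUs, skip those without a TX thread, and pick the one with the lowest usage count.

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8
#define MAX_USE 1024	/* stands in for SIW_MAX_QP as the initial minimum */

static int pick_tx_cpu(const bool has_thread[NCPUS], const int use_cnt[NCPUS])
{
	int min_use = MAX_USE, tx_cpu = -1;

	for (int cpu = 0; cpu < NCPUS; cpu++) {
		if (!has_thread[cpu])	/* skip cores with no TX thread */
			continue;
		/* '<=' mirrors the kernel loop: later CPUs win ties */
		if (use_cnt[cpu] <= min_use) {
			tx_cpu = cpu;
			min_use = use_cnt[cpu];
		}
	}
	return tx_cpu;	/* -1 if no usable CPU was found */
}

int main(void)
{
	bool has_thread[NCPUS] = { true, true, false, true, true, true, true, true };
	int use_cnt[NCPUS] = { 5, 2, 0, 7, 2, 9, 1, 4 };

	/* CPU 6 wins: lowest count among CPUs that actually have a TX thread */
	printf("selected tx cpu: %d\n", pick_tx_cpu(has_thread, use_cnt));
	return 0;
}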
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Driver for the VoIP USB phones with CM109 chipsets.
 *
 * Copyright (C) 2007 - 2008 Alfred E. Heggestad <aeh@db.org>
 */

/*
 * Tested devices:
 *	- Komunikate KIP1000
 *	- Genius G-talk
 *	- Allied-Telesis Corega USBPH01
 *	- ...
 *
 * This driver is based on the yealink.c driver
 *
 * Thanks to:
 *   - Authors of yealink.c
 *   - Thomas Reitmayr
 *   - Oliver Neukum for good review comments and code
 *   - Shaun Jackman <sjackman@gmail.com> for Genius G-talk keymap
 *   - Dmitry Torokhov for valuable input and review
 *
 * Todo:
 *   - Read/write EEPROM
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/rwsem.h>
#include <linux/usb/input.h>

#define DRIVER_VERSION "20080805"
#define DRIVER_AUTHOR  "Alfred E. Heggestad"
#define DRIVER_DESC    "CM109 phone driver"

static char *phone = "kip1000";
module_param(phone, charp, S_IRUSR);
MODULE_PARM_DESC(phone, "Phone name {kip1000, gtalk, usbph01, atcom}");

enum {
	/* HID Registers */
	HID_IR0 = 0x00, /* Record/Playback-mute button, Volume up/down  */
	HID_IR1 = 0x01, /* GPI, generic registers or EEPROM_DATA0       */
	HID_IR2 = 0x02, /* Generic registers or EEPROM_DATA1            */
	HID_IR3 = 0x03, /* Generic registers or EEPROM_CTRL             */
	HID_OR0 = 0x00, /* Mapping control, buzzer, SPDIF (offset 0x04) */
	HID_OR1 = 0x01, /* GPO - General Purpose Output                 */
	HID_OR2 = 0x02, /* Set GPIO to input/output mode                */
	HID_OR3 = 0x03, /* SPDIF status channel or EEPROM_CTRL          */

	/* HID_IR0 */
	RECORD_MUTE   = 1 << 3,
	PLAYBACK_MUTE = 1 << 2,
	VOLUME_DOWN   = 1 << 1,
	VOLUME_UP     = 1 << 0,

	/* HID_OR0 */
	/* bits 7-6
	 *   0: HID_OR1-2 are used for GPO; HID_OR0, 3 are used for buzzer
	 *      and SPDIF
	 *   1: HID_OR0-3 are used as generic HID registers
	 *   2: Values written to HID_OR0-3 are also mapped to MCU_CTRL,
	 *      EEPROM_DATA0-1, EEPROM_CTRL (see Note)
	 *   3: Reserved
	 */
	HID_OR_GPO_BUZ_SPDIF   = 0 << 6,
	HID_OR_GENERIC_HID_REG = 1 << 6,
	HID_OR_MAP_MCU_EEPROM  = 2 << 6,

	BUZZER_ON = 1 << 5,

	/* up to 256 normal keys, up to 15 special key combinations */
	KEYMAP_SIZE = 256 + 15,
};

/* CM109 protocol packet */
struct cm109_ctl_packet {
	u8 byte[4];
} __attribute__ ((packed));

enum { USB_PKT_LEN = sizeof(struct cm109_ctl_packet) };

/* CM109 device structure */
struct cm109_dev {
	struct input_dev *idev;	 /* input device */
	struct usb_device *udev; /* usb device */
	struct usb_interface *intf;

	/* irq input channel */
	struct cm109_ctl_packet *irq_data;
	dma_addr_t irq_dma;
	struct urb *urb_irq;

	/* control output channel */
	struct cm109_ctl_packet *ctl_data;
	dma_addr_t ctl_dma;
	struct usb_ctrlrequest *ctl_req;
	struct urb *urb_ctl;
	/*
	 * The 3 bitfields below are protected by ctl_submit_lock.
	 * They have to be separate since they are accessed from IRQ
	 * context.
	 */
	unsigned irq_urb_pending:1;	/* irq_urb is in flight */
	unsigned ctl_urb_pending:1;	/* ctl_urb is in flight */
	unsigned buzzer_pending:1;	/* need to issue buzz command */
	spinlock_t ctl_submit_lock;

	unsigned char buzzer_state;	/* on/off */

	/* flags */
	unsigned open:1;
	unsigned resetting:1;
	unsigned shutdown:1;

	/* This mutex protects writes to the above flags */
	struct mutex pm_mutex;

	unsigned short keymap[KEYMAP_SIZE];

	char phys[64];		/* physical device path */
	int key_code;		/* last reported key */
	int keybit;		/* 0=new scan  1,2,4,8=scan columns */
	u8 gpi;			/* Cached value of GPI (high nibble) */
};

/******************************************************************************
 * CM109 key interface
 *****************************************************************************/

static unsigned short special_keymap(int code)
{
	if (code > 0xff) {
		switch (code - 0xff) {
		case RECORD_MUTE:
			return KEY_MICMUTE;
		case PLAYBACK_MUTE:
			return KEY_MUTE;
		case VOLUME_DOWN:
			return KEY_VOLUMEDOWN;
		case VOLUME_UP:
			return KEY_VOLUMEUP;
		}
	}
	return KEY_RESERVED;
}

/* Map device buttons to internal key events.
 *
 * The "up" and "down" keys are symbolised by arrows on the button.
 * The "pickup" and "hangup" keys are symbolised by a green and red phone
 * on the button.
 *
 * Komunikate KIP1000 Keyboard Matrix
 *
 *    -> -- 1 -- 2 -- 3  --> GPI pin 4 (0x10)
 *     |    |    |    |
 *    <- -- 4 -- 5 -- 6  --> GPI pin 5 (0x20)
 *     |    |    |    |
 *   END -- 7 -- 8 -- 9  --> GPI pin 6 (0x40)
 *     |    |    |    |
 *    OK -- * -- 0 -- #  --> GPI pin 7 (0x80)
 *     |    |    |    |
 *
 *    /|\  /|\  /|\  /|\
 *     |    |    |    |
 * GPO
 * pin:  3    2    1    0
 *      0x8  0x4  0x2  0x1
 */
static unsigned short keymap_kip1000(int scancode)
{
	switch (scancode) {				/* phone key: */
	case 0x82: return KEY_NUMERIC_0;		/*   0        */
	case 0x14: return KEY_NUMERIC_1;		/*   1        */
	case 0x12: return KEY_NUMERIC_2;		/*   2        */
	case 0x11: return KEY_NUMERIC_3;		/*   3        */
	case 0x24: return KEY_NUMERIC_4;		/*   4        */
	case 0x22: return KEY_NUMERIC_5;		/*   5        */
	case 0x21: return KEY_NUMERIC_6;		/*   6        */
	case 0x44: return KEY_NUMERIC_7;		/*   7        */
	case 0x42: return KEY_NUMERIC_8;		/*   8        */
	case 0x41: return KEY_NUMERIC_9;		/*   9        */
	case 0x81: return KEY_NUMERIC_POUND;		/*   #        */
	case 0x84: return KEY_NUMERIC_STAR;		/*   *        */
	case 0x88: return KEY_ENTER;			/*   pickup   */
	case 0x48: return KEY_ESC;			/*   hangup   */
	case 0x28: return KEY_LEFT;			/*   IN       */
	case 0x18: return KEY_RIGHT;			/*   OUT      */
	default:
		return special_keymap(scancode);
	}
}

/*
 * Contributed by Shaun Jackman <sjackman@gmail.com>
 *
 * Genius G-Talk keyboard matrix
 *    0 1 2 3
 * 4: 0 4 8 Talk
 * 5: 1 5 9 End
 * 6: 2 6 # Up
 * 7: 3 7 * Down
 */
static unsigned short keymap_gtalk(int scancode)
{
	switch (scancode) {
	case 0x11: return KEY_NUMERIC_0;
	case 0x21: return KEY_NUMERIC_1;
	case 0x41: return KEY_NUMERIC_2;
	case 0x81: return KEY_NUMERIC_3;
	case 0x12: return KEY_NUMERIC_4;
	case 0x22: return KEY_NUMERIC_5;
	case 0x42: return KEY_NUMERIC_6;
	case 0x82: return KEY_NUMERIC_7;
	case 0x14: return KEY_NUMERIC_8;
	case 0x24: return KEY_NUMERIC_9;
	case 0x44: return KEY_NUMERIC_POUND;	/* # */
	case 0x84: return KEY_NUMERIC_STAR;	/* * */
	case 0x18: return KEY_ENTER;		/* Talk (green handset) */
	case 0x28: return KEY_ESC;		/* End (red handset) */
	case 0x48: return KEY_UP;		/* Menu up (rocker switch) */
	case 0x88: return KEY_DOWN;		/* Menu down (rocker switch) */
	default:
		return special_keymap(scancode);
	}
}

/*
 * Keymap for Allied-Telesis Corega USBPH01
 * http://www.alliedtelesis-corega.com/2/1344/1437/1360/chprd.html
 *
 * Contributed by july@nat.bg
 */
static unsigned short keymap_usbph01(int scancode)
{
	switch (scancode) {
	case 0x11: return KEY_NUMERIC_0;	/* 0 */
	case 0x21: return KEY_NUMERIC_1;	/* 1 */
	case 0x41: return KEY_NUMERIC_2;	/* 2 */
	case 0x81: return KEY_NUMERIC_3;	/* 3 */
	case 0x12: return KEY_NUMERIC_4;	/* 4 */
	case 0x22: return KEY_NUMERIC_5;	/* 5 */
	case 0x42: return KEY_NUMERIC_6;	/* 6 */
	case 0x82: return KEY_NUMERIC_7;	/* 7 */
	case 0x14: return KEY_NUMERIC_8;	/* 8 */
	case 0x24: return KEY_NUMERIC_9;	/* 9 */
	case 0x44: return KEY_NUMERIC_POUND;	/* # */
	case 0x84: return KEY_NUMERIC_STAR;	/* * */
	case 0x18: return KEY_ENTER;		/* pickup */
	case 0x28: return KEY_ESC;		/* hangup */
	case 0x48: return KEY_LEFT;		/* IN */
	case 0x88: return KEY_RIGHT;		/* OUT */
	default:
		return special_keymap(scancode);
	}
}

/*
 * Keymap for ATCom AU-100
 * http://www.atcom.cn/products.html
 * http://www.packetizer.com/products/au100/
 * http://www.voip-info.org/wiki/view/AU-100
 *
 * Contributed by daniel@gimpelevich.san-francisco.ca.us
 */
static unsigned short keymap_atcom(int scancode)
{
	switch (scancode) {				/* phone key:  */
	case 0x82: return KEY_NUMERIC_0;		/*   0         */
	case 0x11: return KEY_NUMERIC_1;		/*   1         */
	case 0x12: return KEY_NUMERIC_2;		/*   2         */
	case 0x14: return KEY_NUMERIC_3;		/*   3         */
	case 0x21: return KEY_NUMERIC_4;		/*   4         */
	case 0x22: return KEY_NUMERIC_5;		/*   5         */
	case 0x24: return KEY_NUMERIC_6;		/*   6         */
	case 0x41: return KEY_NUMERIC_7;		/*   7         */
	case 0x42: return KEY_NUMERIC_8;		/*   8         */
	case 0x44: return KEY_NUMERIC_9;		/*   9         */
	case 0x84: return KEY_NUMERIC_POUND;		/*   #         */
	case 0x81: return KEY_NUMERIC_STAR;		/*   *         */
	case 0x18: return KEY_ENTER;			/*   pickup    */
	case 0x28: return KEY_ESC;			/*   hangup    */
	case 0x48: return KEY_LEFT;			/* left arrow  */
	case 0x88: return KEY_RIGHT;			/* right arrow */
	default:
		return special_keymap(scancode);
	}
}

static unsigned short (*keymap)(int) = keymap_kip1000;

/*
 * Completes a request by converting the data into events for the
 * input subsystem.
 */
static void report_key(struct cm109_dev *dev, int key)
{
	struct input_dev *idev = dev->idev;

	if (dev->key_code >= 0) {
		/* old key up */
		input_report_key(idev, dev->key_code, 0);
	}

	dev->key_code = key;
	if (key >= 0) {
		/* new valid key */
		input_report_key(idev, key, 1);
	}

	input_sync(idev);
}

/*
 * Converts data of special key presses (volume, mute) into events
 * for the input subsystem, sends press-n-release for mute keys.
 */
static void cm109_report_special(struct cm109_dev *dev)
{
	static const u8 autorelease = RECORD_MUTE | PLAYBACK_MUTE;
	struct input_dev *idev = dev->idev;
	u8 data = dev->irq_data->byte[HID_IR0];
	unsigned short keycode;
	int i;

	for (i = 0; i < 4; i++) {
		keycode = dev->keymap[0xff + BIT(i)];
		if (keycode == KEY_RESERVED)
			continue;

		input_report_key(idev, keycode, data & BIT(i));
		if (data & autorelease & BIT(i)) {
			input_sync(idev);
			input_report_key(idev, keycode, 0);
		}
	}
	input_sync(idev);
}

/******************************************************************************
 * CM109 usb communication interface
 *****************************************************************************/

static void cm109_submit_buzz_toggle(struct cm109_dev *dev)
{
	int error;

	if (dev->buzzer_state)
		dev->ctl_data->byte[HID_OR0] |= BUZZER_ON;
	else
		dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON;

	error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC);
	if (error)
		dev_err(&dev->intf->dev,
			"%s: usb_submit_urb (urb_ctl) failed %d\n",
			__func__, error);
}

/*
 * IRQ handler
 */
static void cm109_urb_irq_callback(struct urb *urb)
{
	struct cm109_dev *dev = urb->context;
	const int status = urb->status;
	int error;
	unsigned long flags;

	dev_dbg(&dev->intf->dev,
		"### URB IRQ: [0x%02x 0x%02x 0x%02x 0x%02x] keybit=0x%02x\n",
		dev->irq_data->byte[0],
		dev->irq_data->byte[1],
		dev->irq_data->byte[2],
		dev->irq_data->byte[3],
		dev->keybit);

	if (status) {
		if (status == -ESHUTDOWN)
			return;
		dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n",
				    __func__, status);
		goto out;
	}

	/* Special keys */
	cm109_report_special(dev);

	/* Scan key column */
	if (dev->keybit == 0xf) {
		/* Any changes ? */
		if ((dev->gpi & 0xf0) == (dev->irq_data->byte[HID_IR1] & 0xf0))
			goto out;

		dev->gpi = dev->irq_data->byte[HID_IR1] & 0xf0;
		dev->keybit = 0x1;
	} else {
		report_key(dev, dev->keymap[dev->irq_data->byte[HID_IR1]]);

		dev->keybit <<= 1;
		if (dev->keybit > 0x8)
			dev->keybit = 0xf;
	}

 out:
	spin_lock_irqsave(&dev->ctl_submit_lock, flags);

	dev->irq_urb_pending = 0;

	if (likely(!dev->shutdown)) {
		if (dev->buzzer_state)
			dev->ctl_data->byte[HID_OR0] |= BUZZER_ON;
		else
			dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON;

		dev->ctl_data->byte[HID_OR1] = dev->keybit;
		dev->ctl_data->byte[HID_OR2] = dev->keybit;

		dev->buzzer_pending = 0;
		dev->ctl_urb_pending = 1;

		error = usb_submit_urb(dev->urb_ctl, GFP_ATOMIC);
		if (error)
			dev_err(&dev->intf->dev,
				"%s: usb_submit_urb (urb_ctl) failed %d\n",
				__func__, error);
	}

	spin_unlock_irqrestore(&dev->ctl_submit_lock, flags);
}

static void cm109_urb_ctl_callback(struct urb *urb)
{
	struct cm109_dev *dev = urb->context;
	const int status = urb->status;
	int error;
	unsigned long flags;

	dev_dbg(&dev->intf->dev,
		"### URB CTL: [0x%02x 0x%02x 0x%02x 0x%02x]\n",
		dev->ctl_data->byte[0],
		dev->ctl_data->byte[1],
		dev->ctl_data->byte[2],
		dev->ctl_data->byte[3]);

	if (status) {
		if (status == -ESHUTDOWN)
			return;
		dev_err_ratelimited(&dev->intf->dev, "%s: urb status %d\n",
				    __func__, status);
	}

	spin_lock_irqsave(&dev->ctl_submit_lock, flags);

	dev->ctl_urb_pending = 0;

	if (likely(!dev->shutdown)) {
		if (dev->buzzer_pending || status) {
			dev->buzzer_pending = 0;
			dev->ctl_urb_pending = 1;
			cm109_submit_buzz_toggle(dev);
		} else if (likely(!dev->irq_urb_pending)) {
			/* ask for key data */
			dev->irq_urb_pending = 1;
			error = usb_submit_urb(dev->urb_irq, GFP_ATOMIC);
			if (error)
				dev_err(&dev->intf->dev,
					"%s: usb_submit_urb (urb_irq) failed %d\n",
					__func__, error);
		}
	}

	spin_unlock_irqrestore(&dev->ctl_submit_lock, flags);
}

static void cm109_toggle_buzzer_async(struct cm109_dev *dev)
{
	unsigned long flags;

	spin_lock_irqsave(&dev->ctl_submit_lock, flags);

	if (dev->ctl_urb_pending) {
		/* URB completion will resubmit */
		dev->buzzer_pending = 1;
	} else {
		dev->ctl_urb_pending = 1;
		cm109_submit_buzz_toggle(dev);
	}

	spin_unlock_irqrestore(&dev->ctl_submit_lock, flags);
}

static void cm109_toggle_buzzer_sync(struct cm109_dev *dev, int on)
{
	int error;

	if (on)
		dev->ctl_data->byte[HID_OR0] |= BUZZER_ON;
	else
		dev->ctl_data->byte[HID_OR0] &= ~BUZZER_ON;

	error = usb_control_msg(dev->udev,
				usb_sndctrlpipe(dev->udev, 0),
				dev->ctl_req->bRequest,
				dev->ctl_req->bRequestType,
				le16_to_cpu(dev->ctl_req->wValue),
				le16_to_cpu(dev->ctl_req->wIndex),
				dev->ctl_data,
				USB_PKT_LEN, USB_CTRL_SET_TIMEOUT);
	if (error < 0 && error != -EINTR)
		dev_err(&dev->intf->dev, "%s: usb_control_msg() failed %d\n",
			__func__, error);
}

static void cm109_stop_traffic(struct cm109_dev *dev)
{
	dev->shutdown = 1;
	/*
	 * Make sure other CPUs see this
	 */
	smp_wmb();

	usb_kill_urb(dev->urb_ctl);
	usb_kill_urb(dev->urb_irq);

	cm109_toggle_buzzer_sync(dev, 0);

	dev->shutdown = 0;
	smp_wmb();
}

static void cm109_restore_state(struct cm109_dev *dev)
{
	if (dev->open) {
		/*
		 * Restore buzzer state.
		 * This will also kick regular URB submission
		 */
		cm109_toggle_buzzer_async(dev);
	}
}

/******************************************************************************
 * input event interface
 *****************************************************************************/

static int cm109_input_open(struct input_dev *idev)
{
	struct cm109_dev *dev = input_get_drvdata(idev);
	int error;

	error = usb_autopm_get_interface(dev->intf);
	if (error < 0) {
		dev_err(&idev->dev, "%s - cannot autoresume, result %d\n",
			__func__, error);
		return error;
	}

	mutex_lock(&dev->pm_mutex);

	dev->buzzer_state = 0;
	dev->key_code = -1;	/* no keys pressed */
	dev->keybit = 0xf;

	/* issue INIT */
	dev->ctl_data->byte[HID_OR0] = HID_OR_GPO_BUZ_SPDIF;
	dev->ctl_data->byte[HID_OR1] = dev->keybit;
	dev->ctl_data->byte[HID_OR2] = dev->keybit;
	dev->ctl_data->byte[HID_OR3] = 0x00;

	dev->ctl_urb_pending = 1;
	error = usb_submit_urb(dev->urb_ctl, GFP_KERNEL);
	if (error) {
		dev->ctl_urb_pending = 0;
		dev_err(&dev->intf->dev,
			"%s: usb_submit_urb (urb_ctl) failed %d\n",
			__func__, error);
	} else {
		dev->open = 1;
	}

	mutex_unlock(&dev->pm_mutex);

	if (error)
		usb_autopm_put_interface(dev->intf);

	return error;
}

static void cm109_input_close(struct input_dev *idev)
{
	struct cm109_dev *dev = input_get_drvdata(idev);

	mutex_lock(&dev->pm_mutex);

	/*
	 * Once we are here event delivery is stopped so we
	 * don't need to worry about someone starting buzzer
	 * again
	 */
	cm109_stop_traffic(dev);
	dev->open = 0;

	mutex_unlock(&dev->pm_mutex);

	usb_autopm_put_interface(dev->intf);
}

static int cm109_input_ev(struct input_dev *idev, unsigned int type,
			  unsigned int code, int value)
{
	struct cm109_dev *dev = input_get_drvdata(idev);

	dev_dbg(&dev->intf->dev,
		"input_ev: type=%u code=%u value=%d\n", type, code, value);

	if (type != EV_SND)
		return -EINVAL;

	switch (code) {
	case SND_TONE:
	case SND_BELL:
		dev->buzzer_state = !!value;
		if (!dev->resetting)
			cm109_toggle_buzzer_async(dev);
		return 0;

	default:
		return -EINVAL;
	}
}

/******************************************************************************
 * Linux interface and usb initialisation
 *****************************************************************************/

struct driver_info {
	char *name;
};

static const struct driver_info info_cm109 = {
	.name = "CM109 USB driver",
};

enum {
	VENDOR_ID        = 0x0d8c, /* C-Media Electronics */
	PRODUCT_ID_CM109 = 0x000e, /* CM109 defines range 0x0008 - 0x000f */
};

/* table of devices that work with this driver */
static const struct usb_device_id cm109_usb_table[] = {
	{
		.match_flags = USB_DEVICE_ID_MATCH_DEVICE |
				USB_DEVICE_ID_MATCH_INT_INFO,
		.idVendor = VENDOR_ID,
		.idProduct = PRODUCT_ID_CM109,
		.bInterfaceClass = USB_CLASS_HID,
		.bInterfaceSubClass = 0,
		.bInterfaceProtocol = 0,
		.driver_info = (kernel_ulong_t) &info_cm109
	},
	/* you can add more devices here with product ID 0x0008 - 0x000f */
	{ }
};

static void cm109_usb_cleanup(struct cm109_dev *dev)
{
	kfree(dev->ctl_req);
	usb_free_coherent(dev->udev, USB_PKT_LEN, dev->ctl_data, dev->ctl_dma);
	usb_free_coherent(dev->udev, USB_PKT_LEN, dev->irq_data, dev->irq_dma);

	usb_free_urb(dev->urb_irq);	/* parameter validation in core/urb */
	usb_free_urb(dev->urb_ctl);	/* parameter validation in core/urb */
	kfree(dev);
}

static void cm109_usb_disconnect(struct usb_interface *interface)
{
	struct cm109_dev *dev = usb_get_intfdata(interface);

	usb_set_intfdata(interface, NULL);
	input_unregister_device(dev->idev);
	cm109_usb_cleanup(dev);
}

static int cm109_usb_probe(struct usb_interface *intf,
			   const struct usb_device_id *id)
{
	struct usb_device *udev = interface_to_usbdev(intf);
	struct driver_info *nfo = (struct driver_info *)id->driver_info;
	struct usb_host_interface *interface;
	struct usb_endpoint_descriptor *endpoint;
	struct cm109_dev *dev;
	struct input_dev *input_dev = NULL;
	int ret, pipe, i;
	int error = -ENOMEM;

	interface = intf->cur_altsetting;

	if (interface->desc.bNumEndpoints < 1)
		return -ENODEV;

	endpoint = &interface->endpoint[0].desc;
	if (!usb_endpoint_is_int_in(endpoint))
		return -ENODEV;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	spin_lock_init(&dev->ctl_submit_lock);
	mutex_init(&dev->pm_mutex);

	dev->udev = udev;
	dev->intf = intf;

	dev->idev = input_dev = input_allocate_device();
	if (!input_dev)
		goto err_out;

	/* allocate usb buffers */
	dev->irq_data = usb_alloc_coherent(udev, USB_PKT_LEN,
					   GFP_KERNEL, &dev->irq_dma);
	if (!dev->irq_data)
		goto err_out;

	dev->ctl_data = usb_alloc_coherent(udev, USB_PKT_LEN,
					   GFP_KERNEL, &dev->ctl_dma);
	if (!dev->ctl_data)
		goto err_out;

	dev->ctl_req = kmalloc(sizeof(*(dev->ctl_req)), GFP_KERNEL);
	if (!dev->ctl_req)
		goto err_out;

	/* allocate urb structures */
	dev->urb_irq = usb_alloc_urb(0, GFP_KERNEL);
	if (!dev->urb_irq)
		goto err_out;

	dev->urb_ctl = usb_alloc_urb(0, GFP_KERNEL);
	if (!dev->urb_ctl)
		goto err_out;

	/* get a handle to the interrupt data pipe */
	pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress);
	ret = usb_maxpacket(udev, pipe);
	if (ret != USB_PKT_LEN)
		dev_err(&intf->dev, "invalid payload size %d, expected %d\n",
			ret, USB_PKT_LEN);

	/* initialise irq urb */
	usb_fill_int_urb(dev->urb_irq, udev, pipe, dev->irq_data,
			 USB_PKT_LEN,
			 cm109_urb_irq_callback, dev,
			 endpoint->bInterval);
	dev->urb_irq->transfer_dma = dev->irq_dma;
	dev->urb_irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
	dev->urb_irq->dev = udev;

	/* initialise ctl urb */
	dev->ctl_req->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE |
					USB_DIR_OUT;
	dev->ctl_req->bRequest = USB_REQ_SET_CONFIGURATION;
	dev->ctl_req->wValue = cpu_to_le16(0x200);
	dev->ctl_req->wIndex = cpu_to_le16(interface->desc.bInterfaceNumber);
	dev->ctl_req->wLength = cpu_to_le16(USB_PKT_LEN);

	usb_fill_control_urb(dev->urb_ctl, udev, usb_sndctrlpipe(udev, 0),
			     (void *)dev->ctl_req, dev->ctl_data, USB_PKT_LEN,
			     cm109_urb_ctl_callback, dev);
	dev->urb_ctl->transfer_dma = dev->ctl_dma;
	dev->urb_ctl->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
	dev->urb_ctl->dev = udev;

	/* find out the physical bus location */
	usb_make_path(udev, dev->phys, sizeof(dev->phys));
	strlcat(dev->phys, "/input0", sizeof(dev->phys));

	/* register settings for the input device */
	input_dev->name = nfo->name;
	input_dev->phys = dev->phys;
	usb_to_input_id(udev, &input_dev->id);
	input_dev->dev.parent = &intf->dev;

	input_set_drvdata(input_dev, dev);
	input_dev->open = cm109_input_open;
	input_dev->close = cm109_input_close;
	input_dev->event = cm109_input_ev;

	input_dev->keycode = dev->keymap;
	input_dev->keycodesize = sizeof(unsigned char);
	input_dev->keycodemax = ARRAY_SIZE(dev->keymap);

	input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_SND);
	input_dev->sndbit[0] = BIT_MASK(SND_BELL) | BIT_MASK(SND_TONE);

	/* register available key events */
	for (i = 0; i < KEYMAP_SIZE; i++) {
		unsigned short k = keymap(i);
		dev->keymap[i] = k;
		__set_bit(k, input_dev->keybit);
	}
	__clear_bit(KEY_RESERVED, input_dev->keybit);

	error = input_register_device(dev->idev);
	if (error)
		goto err_out;

	usb_set_intfdata(intf, dev);

	return 0;

 err_out:
	input_free_device(input_dev);
	cm109_usb_cleanup(dev);
	return error;
}

static int cm109_usb_suspend(struct usb_interface *intf, pm_message_t message)
{
	struct cm109_dev *dev = usb_get_intfdata(intf);

	dev_info(&intf->dev, "cm109: usb_suspend (event=%d)\n", message.event);

	mutex_lock(&dev->pm_mutex);
	cm109_stop_traffic(dev);
	mutex_unlock(&dev->pm_mutex);

	return 0;
}

static int cm109_usb_resume(struct usb_interface *intf)
{
	struct cm109_dev *dev = usb_get_intfdata(intf);

	dev_info(&intf->dev, "cm109: usb_resume\n");

	mutex_lock(&dev->pm_mutex);
	cm109_restore_state(dev);
	mutex_unlock(&dev->pm_mutex);

	return 0;
}

static int cm109_usb_pre_reset(struct usb_interface *intf)
{
	struct cm109_dev *dev = usb_get_intfdata(intf);

	mutex_lock(&dev->pm_mutex);

	/*
	 * Make sure input events don't try to toggle buzzer
	 * while we are resetting
	 */
	dev->resetting = 1;
	smp_wmb();

	cm109_stop_traffic(dev);

	return 0;
}

static int cm109_usb_post_reset(struct usb_interface *intf)
{
	struct cm109_dev *dev = usb_get_intfdata(intf);

	dev->resetting = 0;
	smp_wmb();

	cm109_restore_state(dev);

	mutex_unlock(&dev->pm_mutex);

	return 0;
}

static struct usb_driver cm109_driver = {
	.name		= "cm109",
	.probe		= cm109_usb_probe,
	.disconnect	= cm109_usb_disconnect,
	.suspend	= cm109_usb_suspend,
	.resume		= cm109_usb_resume,
	.reset_resume	= cm109_usb_resume,
	.pre_reset	= cm109_usb_pre_reset,
	.post_reset	= cm109_usb_post_reset,
	.id_table	= cm109_usb_table,
	.supports_autosuspend = 1,
};

static int __init cm109_select_keymap(void)
{
	/* Load the phone keymap */
	if (!strcasecmp(phone, "kip1000")) {
		keymap = keymap_kip1000;
		printk(KERN_INFO KBUILD_MODNAME ": "
			"Keymap for Komunikate KIP1000 phone loaded\n");
	} else if (!strcasecmp(phone, "gtalk")) {
		keymap = keymap_gtalk;
		printk(KERN_INFO KBUILD_MODNAME ": "
			"Keymap for Genius G-talk phone loaded\n");
	} else if (!strcasecmp(phone, "usbph01")) {
		keymap = keymap_usbph01;
		printk(KERN_INFO KBUILD_MODNAME ": "
			"Keymap for Allied-Telesis Corega USBPH01 phone loaded\n");
	} else if (!strcasecmp(phone, "atcom")) {
		keymap = keymap_atcom;
		printk(KERN_INFO KBUILD_MODNAME ": "
			"Keymap for ATCom AU-100 phone loaded\n");
	} else {
		printk(KERN_ERR KBUILD_MODNAME ": "
			"Unsupported phone: %s\n", phone);
		return -EINVAL;
	}

	return 0;
}

static int __init cm109_init(void)
{
	int err;

	err = cm109_select_keymap();
	if (err)
		return err;

	err = usb_register(&cm109_driver);
	if (err)
		return err;

	printk(KERN_INFO KBUILD_MODNAME ": "
		DRIVER_DESC ": " DRIVER_VERSION " (C) " DRIVER_AUTHOR "\n");

	return 0;
}

static void __exit cm109_exit(void)
{
	usb_deregister(&cm109_driver);
}

module_init(cm109_init);
module_exit(cm109_exit);

MODULE_DEVICE_TABLE(usb, cm109_usb_table);

MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");
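
For context, here is a minimal userspace sketch of how the EV_KEY events this driver reports can be consumed through the standard evdev interface. The event-node path is an assumption (the real node can be found via /proc/bus/input/devices); the program is illustrative and not part of the driver.

/* Hypothetical userspace consumer of cm109 key events (assumed node path). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/input.h>

int main(void)
{
	struct input_event ev;
	int fd = open("/dev/input/event5", O_RDONLY);	/* assumed node */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Each keypress arrives as an EV_KEY event with value 1 (press)
	 * or 0 (release), followed by an EV_SYN report. */
	while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
		if (ev.type == EV_KEY)
			printf("key %u %s\n", ev.code,
			       ev.value ? "pressed" : "released");
	}

	close(fd);
	return 0;
}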
// SPDX-License-Identifier: GPL-2.0-only
/*
 * vhost transport for vsock
 *
 * Copyright (C) 2013-2015 Red Hat, Inc.
 * Author: Asias He <asias@redhat.com>
 *         Stefan Hajnoczi <stefanha@redhat.com>
 */
#include <linux/miscdevice.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <net/sock.h>
#include <linux/virtio_vsock.h>
#include <linux/vhost.h>
#include <linux/hashtable.h>

#include <net/af_vsock.h>
#include "vhost.h"

#define VHOST_VSOCK_DEFAULT_HOST_CID	2
/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others.
 */
#define VHOST_VSOCK_WEIGHT 0x80000
/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with
 * small pkts.
 */
#define VHOST_VSOCK_PKT_WEIGHT 256

enum {
	VHOST_VSOCK_FEATURES = VHOST_FEATURES |
			       (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
			       (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
};

enum {
	VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

/* Used to track all the vhost_vsock instances on the system. */
static DEFINE_MUTEX(vhost_vsock_mutex);
static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);

struct vhost_vsock {
	struct vhost_dev dev;
	struct vhost_virtqueue vqs[2];

	/* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
	struct hlist_node hash;

	struct vhost_work send_pkt_work;
	struct sk_buff_head send_pkt_queue; /* host->guest pending packets */

	atomic_t queued_replies;

	u32 guest_cid;
	bool seqpacket_allow;
};

static u32 vhost_transport_get_local_cid(void)
{
	return VHOST_VSOCK_DEFAULT_HOST_CID;
}

/* Callers that dereference the return value must hold vhost_vsock_mutex or the
 * RCU read lock.
 */
static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
{
	struct vhost_vsock *vsock;

	hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) {
		u32 other_cid = vsock->guest_cid;

		/* Skip instances that have no CID yet */
		if (other_cid == 0)
			continue;

		if (other_cid == guest_cid)
			return vsock;
	}

	return NULL;
}

static void
vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
			    struct vhost_virtqueue *vq)
{
	struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
	int pkts = 0, total_len = 0;
	bool added = false;
	bool restart_tx = false;

	mutex_lock(&vq->mutex);

	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	/* Avoid further vmexits, we're already processing the virtqueue */
	vhost_disable_notify(&vsock->dev, vq);

	do {
		struct virtio_vsock_hdr *hdr;
		size_t iov_len, payload_len;
		struct iov_iter iov_iter;
		u32 flags_to_restore = 0;
		struct sk_buff *skb;
		unsigned out, in;
		size_t nbytes;
		u32 offset;
		int head;

		skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);

		if (!skb) {
			vhost_enable_notify(&vsock->dev, vq);
			break;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0) {
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
			break;
		}

		if (head == vq->num) {
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
			/* We cannot finish yet if more buffers snuck in while
			 * re-enabling notify.
			 */
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		if (out) {
			kfree_skb(skb);
			vq_err(vq, "Expected 0 output buffers, got %u\n", out);
			break;
		}

		iov_len = iov_length(&vq->iov[out], in);
		if (iov_len < sizeof(*hdr)) {
			kfree_skb(skb);
			vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
			break;
		}

		iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
		offset = VIRTIO_VSOCK_SKB_CB(skb)->offset;
		payload_len = skb->len - offset;
		hdr = virtio_vsock_hdr(skb);

		/* If the packet is greater than the space available in the
		 * buffer, we split it using multiple buffers.
		 */
		if (payload_len > iov_len - sizeof(*hdr)) {
			payload_len = iov_len - sizeof(*hdr);

			/* As we are copying pieces of large packet's buffer to
			 * small rx buffers, headers of packets in rx queue are
			 * created dynamically and are initialized with header
			 * of current packet (except length). But in case of
			 * SOCK_SEQPACKET, we also must clear message delimiter
			 * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit
			 * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise,
			 * there will be sequence of packets with these
			 * bits set. After the initialized header has been
			 * copied to the rx buffer, these required bits will
			 * be restored.
			 */
			if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
				hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
				flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;

				if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) {
					hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
					flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
				}
			}
		}

		/* Set the correct length in the header */
		hdr->len = cpu_to_le32(payload_len);

		nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter);
		if (nbytes != sizeof(*hdr)) {
			kfree_skb(skb);
			vq_err(vq, "Faulted on copying pkt hdr\n");
			break;
		}

		if (skb_copy_datagram_iter(skb,
					   offset,
					   &iov_iter,
					   payload_len)) {
			kfree_skb(skb);
			vq_err(vq, "Faulted on copying pkt buf\n");
			break;
		}

		/* Deliver to monitoring devices all packets that we
		 * will transmit.
		 */
		virtio_transport_deliver_tap_pkt(skb);

		vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
		added = true;

		VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len;
		total_len += payload_len;

		/* If we didn't send all the payload we can requeue the packet
		 * to send it with the next available buffer.
		 */
		if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) {
			hdr->flags |= cpu_to_le32(flags_to_restore);

			/* We are queueing the same skb to handle
			 * the remaining bytes, and we want to deliver it
			 * to monitoring devices in the next iteration.
			 */
			virtio_vsock_skb_clear_tap_delivered(skb);
			virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
		} else {
			if (virtio_vsock_skb_reply(skb)) {
				int val;

				val = atomic_dec_return(&vsock->queued_replies);

				/* Do we have resources to resume tx
				 * processing?
				 */
				if (val + 1 == tx_vq->num)
					restart_tx = true;
			}

			consume_skb(skb);
		}
	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);

	if (restart_tx)
		vhost_poll_queue(&tx_vq->poll);
}

static void vhost_transport_send_pkt_work(struct vhost_work *work)
{
	struct vhost_virtqueue *vq;
	struct vhost_vsock *vsock;

	vsock = container_of(work, struct vhost_vsock, send_pkt_work);
	vq = &vsock->vqs[VSOCK_VQ_RX];

	vhost_transport_do_send_pkt(vsock, vq);
}

static int
vhost_transport_send_pkt(struct sk_buff *skb)
{
	struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
	struct vhost_vsock *vsock;
	int len = skb->len;

	rcu_read_lock();

	/* Find the vhost_vsock according to guest context id  */
	vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
	if (!vsock) {
		rcu_read_unlock();
		kfree_skb(skb);
		return -ENODEV;
	}

	if (virtio_vsock_skb_reply(skb))
		atomic_inc(&vsock->queued_replies);

	virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

	rcu_read_unlock();
	return len;
}

static int
vhost_transport_cancel_pkt(struct vsock_sock *vsk)
{
	struct vhost_vsock *vsock;
	int cnt = 0;
	int ret = -ENODEV;

	rcu_read_lock();

	/* Find the vhost_vsock according to guest context id  */
	vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
	if (!vsock)
		goto out;

	cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue);

	if (cnt) {
		struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
		int new_cnt;

		new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
		if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
			vhost_poll_queue(&tx_vq->poll);
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

static struct sk_buff *
vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
		      unsigned int out, unsigned int in)
{
	struct virtio_vsock_hdr *hdr;
	struct iov_iter iov_iter;
	struct sk_buff *skb;
	size_t payload_len;
	size_t nbytes;
	size_t len;

	if (in != 0) {
		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
		return NULL;
	}

	len = iov_length(vq->iov, out);

	/* len contains both payload and hdr */
	skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
	if (!skb)
		return NULL;

	iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);

	hdr = virtio_vsock_hdr(skb);
	nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
	if (nbytes != sizeof(*hdr)) {
		vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
		       sizeof(*hdr), nbytes);
		kfree_skb(skb);
		return NULL;
	}

	payload_len = le32_to_cpu(hdr->len);

	/* No payload */
	if (!payload_len)
		return skb;

	/* The pkt is too big or the length in the header is invalid */
	if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
	    payload_len + sizeof(*hdr) > len) {
		kfree_skb(skb);
		return NULL;
	}

	virtio_vsock_skb_rx_put(skb);

	nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
	if (nbytes != payload_len) {
		vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
		       payload_len, nbytes);
		kfree_skb(skb);
		return NULL;
	}

	return skb;
}

/* Is there space left for replies to rx packets? */
static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
{
	struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX];
	int val;

	smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
	val = atomic_read(&vsock->queued_replies);

	return val < vq->num;
}

static bool vhost_transport_msgzerocopy_allow(void)
{
	return true;
}

static bool vhost_transport_seqpacket_allow(u32 remote_cid);

static struct virtio_transport vhost_transport = {
	.transport = {
		.module                   = THIS_MODULE,

		.get_local_cid            = vhost_transport_get_local_cid,

		.init                     = virtio_transport_do_socket_init,
		.destruct                 = virtio_transport_destruct,
		.release                  = virtio_transport_release,
		.connect                  = virtio_transport_connect,
		.shutdown                 = virtio_transport_shutdown,
		.cancel_pkt               = vhost_transport_cancel_pkt,

		.dgram_enqueue            = virtio_transport_dgram_enqueue,
		.dgram_dequeue            = virtio_transport_dgram_dequeue,
		.dgram_bind               = virtio_transport_dgram_bind,
		.dgram_allow              = virtio_transport_dgram_allow,

		.stream_enqueue           = virtio_transport_stream_enqueue,
		.stream_dequeue           = virtio_transport_stream_dequeue,
		.stream_has_data          = virtio_transport_stream_has_data,
		.stream_has_space         = virtio_transport_stream_has_space,
		.stream_rcvhiwat          = virtio_transport_stream_rcvhiwat,
		.stream_is_active         = virtio_transport_stream_is_active,
		.stream_allow             = virtio_transport_stream_allow,

		.seqpacket_dequeue        = virtio_transport_seqpacket_dequeue,
		.seqpacket_enqueue        = virtio_transport_seqpacket_enqueue,
		.seqpacket_allow          = vhost_transport_seqpacket_allow,
		.seqpacket_has_data       = virtio_transport_seqpacket_has_data,

		.msgzerocopy_allow        = vhost_transport_msgzerocopy_allow,

		.notify_poll_in           = virtio_transport_notify_poll_in,
		.notify_poll_out          = virtio_transport_notify_poll_out,
		.notify_recv_init         = virtio_transport_notify_recv_init,
		.notify_recv_pre_block    = virtio_transport_notify_recv_pre_block,
		.notify_recv_pre_dequeue  = virtio_transport_notify_recv_pre_dequeue,
		.notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
		.notify_send_init         = virtio_transport_notify_send_init,
		.notify_send_pre_block    = virtio_transport_notify_send_pre_block,
		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
		.notify_buffer_size       = virtio_transport_notify_buffer_size,
		.notify_set_rcvlowat      = virtio_transport_notify_set_rcvlowat,

		.read_skb = virtio_transport_read_skb,
	},

	.send_pkt = vhost_transport_send_pkt,
};

static bool vhost_transport_seqpacket_allow(u32 remote_cid)
{
	struct vhost_vsock *vsock;
	bool seqpacket_allow = false;

	rcu_read_lock();
	vsock = vhost_vsock_get(remote_cid);

	if (vsock)
		seqpacket_allow = vsock->seqpacket_allow;

	rcu_read_unlock();

	return seqpacket_allow;
}

static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);
	int head, pkts = 0, total_len = 0;
	unsigned int out, in;
	struct sk_buff *skb;
	bool added = false;

	mutex_lock(&vq->mutex);

	if (!vhost_vq_get_backend(vq))
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&vsock->dev, vq);
	do {
		struct virtio_vsock_hdr *hdr;

		if (!vhost_vsock_more_replies(vsock)) {
			/* Stop tx until the device processes already
			 * pending replies.  Leave tx virtqueue
			 * callbacks disabled.
			 */
			goto no_more_replies;
		}

		head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
					 &out, &in, NULL, NULL);
		if (head < 0)
			break;

		if (head == vq->num) {
			if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
				vhost_disable_notify(&vsock->dev, vq);
				continue;
			}
			break;
		}

		skb = vhost_vsock_alloc_skb(vq, out, in);
		if (!skb) {
			vq_err(vq, "Faulted on pkt\n");
			continue;
		}

		total_len += sizeof(*hdr) + skb->len;

		/* Deliver to monitoring devices all received packets */
		virtio_transport_deliver_tap_pkt(skb);

		hdr = virtio_vsock_hdr(skb);

		/* Only accept correctly addressed packets */
		if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
		    le64_to_cpu(hdr->dst_cid) ==
		    vhost_transport_get_local_cid())
			virtio_transport_recv_pkt(&vhost_transport, skb);
		else
			kfree_skb(skb);

		vhost_add_used(vq, head, 0);
		added = true;
	} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));

no_more_replies:
	if (added)
		vhost_signal(&vsock->dev, vq);

out:
	mutex_unlock(&vq->mutex);
}

static void vhost_vsock_handle_rx_kick(struct vhost_work *work)
{
	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
						  poll.work);
	struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
						 dev);

	vhost_transport_do_send_pkt(vsock, vq);
}

static int vhost_vsock_start(struct vhost_vsock *vsock)
{
	struct vhost_virtqueue *vq;
	size_t i;
	int ret;

	mutex_lock(&vsock->dev.mutex);

	ret = vhost_dev_check_owner(&vsock->dev);
	if (ret)
		goto err;

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);

		if (!vhost_vq_access_ok(vq)) {
			ret = -EFAULT;
			goto err_vq;
		}

		if (!vhost_vq_get_backend(vq)) {
			vhost_vq_set_backend(vq, vsock);
			ret = vhost_vq_init_access(vq);
			if (ret)
				goto err_vq;
		}

		mutex_unlock(&vq->mutex);
	}

	/* Some packets may have been queued before the device was started,
	 * let's kick the send worker to send them.
	 */
	vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work);

	mutex_unlock(&vsock->dev.mutex);
	return 0;

err_vq:
	vhost_vq_set_backend(vq, NULL);
	mutex_unlock(&vq->mutex);

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);
		vhost_vq_set_backend(vq, NULL);
		mutex_unlock(&vq->mutex);
	}
err:
	mutex_unlock(&vsock->dev.mutex);
	return ret;
}

static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner)
{
	size_t i;
	int ret = 0;

	mutex_lock(&vsock->dev.mutex);

	if (check_owner) {
		ret = vhost_dev_check_owner(&vsock->dev);
		if (ret)
			goto err;
	}

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		struct vhost_virtqueue *vq = &vsock->vqs[i];

		mutex_lock(&vq->mutex);
		vhost_vq_set_backend(vq, NULL);
		mutex_unlock(&vq->mutex);
	}

err:
	mutex_unlock(&vsock->dev.mutex);
	return ret;
}

static void vhost_vsock_free(struct vhost_vsock *vsock)
{
	kvfree(vsock);
}

static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
{
	struct vhost_virtqueue **vqs;
	struct vhost_vsock *vsock;
	int ret;

	/* This struct is large and allocation could fail, fall back to vmalloc
	 * if there is no other way.
	 */
	vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!vsock)
		return -ENOMEM;

	vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL);
	if (!vqs) {
		ret = -ENOMEM;
		goto out;
	}

	vsock->guest_cid = 0; /* no CID assigned yet */

	atomic_set(&vsock->queued_replies, 0);

	vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX];
	vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX];
	vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick;
	vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;

	vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
		       UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
		       VHOST_VSOCK_WEIGHT, true, NULL);

	file->private_data = vsock;
	skb_queue_head_init(&vsock->send_pkt_queue);
	vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
	return 0;

out:
	vhost_vsock_free(vsock);
	return ret;
}

static void vhost_vsock_flush(struct vhost_vsock *vsock)
{
	vhost_dev_flush(&vsock->dev);
}

static void vhost_vsock_reset_orphans(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* vmci_transport.c doesn't take sk_lock here either.  At least we're
	 * under vsock_table_lock so the sock cannot disappear while we're
	 * executing.
	 */

	/* If the peer is still valid, no need to reset connection */
	if (vhost_vsock_get(vsk->remote_addr.svm_cid))
		return;

	/* If the close timeout is pending, let it expire.  This avoids races
	 * with the timeout callback.
	 */
	if (vsk->close_work_scheduled)
		return;

	sock_set_flag(sk, SOCK_DONE);
	vsk->peer_shutdown = SHUTDOWN_MASK;
	sk->sk_state = SS_UNCONNECTED;
	sk->sk_err = ECONNRESET;
	sk_error_report(sk);
}

static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
{
	struct vhost_vsock *vsock = file->private_data;

	mutex_lock(&vhost_vsock_mutex);
	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);
	mutex_unlock(&vhost_vsock_mutex);

	/* Wait for other CPUs to finish using vsock */
	synchronize_rcu();

	/* Iterating over all connections for all CIDs to find orphans is
	 * inefficient.  Room for improvement here.
	 */
	vsock_for_each_connected_socket(&vhost_transport.transport,
					vhost_vsock_reset_orphans);

	/* Don't check the owner, because we are in the release path, so we
	 * need to stop the vsock device in any case.
	 * vhost_vsock_stop() can not fail in this case, so we don't need to
	 * check the return code.
	 */
	vhost_vsock_stop(vsock, false);
	vhost_vsock_flush(vsock);
	vhost_dev_stop(&vsock->dev);

	virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);

	vhost_dev_cleanup(&vsock->dev);
	kfree(vsock->dev.vqs);
	vhost_vsock_free(vsock);
	return 0;
}

static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
{
	struct vhost_vsock *other;

	/* Refuse reserved CIDs */
	if (guest_cid <= VMADDR_CID_HOST ||
	    guest_cid == U32_MAX)
		return -EINVAL;

	/* 64-bit CIDs are not yet supported */
	if (guest_cid > U32_MAX)
		return -EINVAL;

	/* Refuse if CID is assigned to the guest->host transport (i.e. nested
	 * VM), to make the loopback work.
	 */
	if (vsock_find_cid(guest_cid))
		return -EADDRINUSE;

	/* Refuse if CID is already in use */
	mutex_lock(&vhost_vsock_mutex);
	other = vhost_vsock_get(guest_cid);
	if (other && other != vsock) {
		mutex_unlock(&vhost_vsock_mutex);
		return -EADDRINUSE;
	}

	if (vsock->guest_cid)
		hash_del_rcu(&vsock->hash);

	vsock->guest_cid = guest_cid;
	hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid);
	mutex_unlock(&vhost_vsock_mutex);

	return 0;
}

static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features)
{
	struct vhost_virtqueue *vq;
	int i;

	if (features & ~VHOST_VSOCK_FEATURES)
		return -EOPNOTSUPP;

	mutex_lock(&vsock->dev.mutex);
	if ((features & (1 << VHOST_F_LOG_ALL)) &&
	    !vhost_log_access_ok(&vsock->dev)) {
		goto err;
	}

	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
		if (vhost_init_device_iotlb(&vsock->dev))
			goto err;
	}

	if (features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET))
		vsock->seqpacket_allow = true;

	for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) {
		vq = &vsock->vqs[i];
		mutex_lock(&vq->mutex);
		vq->acked_features = features;
		mutex_unlock(&vq->mutex);
	}
	mutex_unlock(&vsock->dev.mutex);
	return 0;

err:
	mutex_unlock(&vsock->dev.mutex);
	return -EFAULT;
}

static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
				  unsigned long arg)
{
	struct vhost_vsock *vsock = f->private_data;
	void __user *argp = (void __user *)arg;
	u64 guest_cid;
	u64 features;
	int start;
	int r;

	switch (ioctl) {
	case VHOST_VSOCK_SET_GUEST_CID:
		if (copy_from_user(&guest_cid, argp, sizeof(guest_cid)))
			return -EFAULT;
		return vhost_vsock_set_cid(vsock, guest_cid);
	case VHOST_VSOCK_SET_RUNNING:
		if (copy_from_user(&start, argp, sizeof(start)))
			return -EFAULT;
		if (start)
			return vhost_vsock_start(vsock);
		else
			return vhost_vsock_stop(vsock, true);
	case VHOST_GET_FEATURES:
		features = VHOST_VSOCK_FEATURES;
		if (copy_to_user(argp, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_FEATURES:
		if (copy_from_user(&features, argp, sizeof(features)))
			return -EFAULT;
		return vhost_vsock_set_features(vsock, features);
	case VHOST_GET_BACKEND_FEATURES:
		features = VHOST_VSOCK_BACKEND_FEATURES;
		if (copy_to_user(argp, &features, sizeof(features)))
			return -EFAULT;
		return 0;
	case VHOST_SET_BACKEND_FEATURES:
		if (copy_from_user(&features, argp, sizeof(features)))
			return -EFAULT;
		if (features & ~VHOST_VSOCK_BACKEND_FEATURES)
			return -EOPNOTSUPP;
		vhost_set_backend_features(&vsock->dev, features);
		return 0;
	default:
		mutex_lock(&vsock->dev.mutex);
		r = vhost_dev_ioctl(&vsock->dev, ioctl, argp);
		if (r == -ENOIOCTLCMD)
			r = vhost_vring_ioctl(&vsock->dev, ioctl, argp);
		else
			vhost_vsock_flush(vsock);
		mutex_unlock(&vsock->dev.mutex);
		return r;
	}
}

static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb,
					 struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;
	int noblock = file->f_flags & O_NONBLOCK;

	return vhost_chr_read_iter(dev, to, noblock);
}

static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;

	return vhost_chr_write_iter(dev, from);
}

static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait)
{
	struct vhost_vsock *vsock = file->private_data;
	struct vhost_dev *dev = &vsock->dev;

	return vhost_chr_poll(file, dev, wait);
}

static const struct file_operations vhost_vsock_fops = {
	.owner          = THIS_MODULE,
	.open           = vhost_vsock_dev_open,
	.release        = vhost_vsock_dev_release,
	.llseek		= noop_llseek,
	.unlocked_ioctl = vhost_vsock_dev_ioctl,
	.compat_ioctl   = compat_ptr_ioctl,
	.read_iter      = vhost_vsock_chr_read_iter,
	.write_iter     = vhost_vsock_chr_write_iter,
	.poll           = vhost_vsock_chr_poll,
};

static struct miscdevice vhost_vsock_misc = {
	.minor = VHOST_VSOCK_MINOR,
	.name = "vhost-vsock",
	.fops = &vhost_vsock_fops,
};

static int __init vhost_vsock_init(void)
{
	int ret;

	ret = vsock_core_register(&vhost_transport.transport,
				  VSOCK_TRANSPORT_F_H2G);
	if (ret < 0)
		return ret;

	ret = misc_register(&vhost_vsock_misc);
	if (ret) {
		vsock_core_unregister(&vhost_transport.transport);
		return ret;
	}

	return 0;
};

static void __exit vhost_vsock_exit(void)
{
	misc_deregister(&vhost_vsock_misc);
	vsock_core_unregister(&vhost_transport.transport);
};

module_init(vhost_vsock_init);
module_exit(vhost_vsock_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Asias He");
MODULE_DESCRIPTION("vhost transport for vsock");
MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR);
MODULE_ALIAS("devname:vhost-vsock");
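
As a companion sketch, this is roughly the ioctl handshake a userspace VMM performs against /dev/vhost-vsock before the device above becomes active. The CID value is an arbitrary example; the virtqueue and memory-table setup that a real VMM does between these calls is omitted, and without it VHOST_VSOCK_SET_RUNNING would fail, since vhost_vsock_start() validates virtqueue access.

/* Hypothetical VMM-side sketch, not part of this module. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

int main(void)
{
	uint64_t guest_cid = 3;	/* example CID, must be > VMADDR_CID_HOST */
	int running = 1;
	int fd = open("/dev/vhost-vsock", O_RDWR);

	if (fd < 0) {
		perror("open /dev/vhost-vsock");
		return 1;
	}

	/* Bind this process as the device owner, then assign the guest CID;
	 * vhost_vsock_set_cid() above enforces its uniqueness. */
	if (ioctl(fd, VHOST_SET_OWNER, NULL) < 0 ||
	    ioctl(fd, VHOST_VSOCK_SET_GUEST_CID, &guest_cid) < 0) {
		perror("ioctl");
		return 1;
	}

	/* VHOST_SET_MEM_TABLE / VHOST_SET_VRING_* setup would go here;
	 * SET_RUNNING fails until the virtqueues are accessible. */
	if (ioctl(fd, VHOST_VSOCK_SET_RUNNING, &running) < 0) {
		perror("VHOST_VSOCK_SET_RUNNING");
		return 1;
	}

	close(fd);
	return 0;
}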
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM csd

#if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CSD_H

#include <linux/tracepoint.h>

TRACE_EVENT(csd_queue_cpu,

	TP_PROTO(const unsigned int cpu,
		 unsigned long callsite,
		 smp_call_func_t func,
		 call_single_data_t *csd),

	TP_ARGS(cpu, callsite, func, csd),

	TP_STRUCT__entry(
		__field(unsigned int, cpu)
		__field(void *, callsite)
		__field(void *, func)
		__field(void *, csd)
	),

	TP_fast_assign(
		__entry->cpu = cpu;
		__entry->callsite = (void *)callsite;
		__entry->func = func;
		__entry->csd = csd;
	),

	TP_printk("cpu=%u callsite=%pS func=%ps csd=%p",
		  __entry->cpu, __entry->callsite, __entry->func, __entry->csd)
);

/*
 * Tracepoints for a function which is called as an effect of
 * smp_call_function.*
 */
DECLARE_EVENT_CLASS(csd_function,

	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),

	TP_ARGS(func, csd),

	TP_STRUCT__entry(
		__field(void *, func)
		__field(void *, csd)
	),

	TP_fast_assign(
		__entry->func = func;
		__entry->csd  = csd;
	),

	TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd)
);

DEFINE_EVENT(csd_function, csd_function_entry,
	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
	TP_ARGS(func, csd)
);

DEFINE_EVENT(csd_function, csd_function_exit,
	TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
	TP_ARGS(func, csd)
);

#endif /* _TRACE_CSD_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
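
The header above only declares the events. Following the usual tracepoint convention, exactly one .c file instantiates them by defining CREATE_TRACE_POINTS before including the header (in mainline, kernel/smp.c plays this role for csd), after which the generated trace_csd_*() helpers can be called. A minimal sketch, where example_emit() is a hypothetical caller:

#define CREATE_TRACE_POINTS	/* emit the tracepoint definitions once */
#include <trace/events/csd.h>

/* Hypothetical caller showing the generated helpers in use. */
static void example_emit(unsigned int cpu, smp_call_func_t func,
			 call_single_data_t *csd)
{
	/* Record that a csd was queued for a remote CPU ... */
	trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);

	/* ... and bracket the eventual execution of its function. */
	trace_csd_function_entry(func, csd);
	func(csd->info);
	trace_csd_function_exit(func, csd);
}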
2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 
// SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints about which node(s) memory should * be allocated on. * * Support the following policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy a per-process counter * is used. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead. * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non-default * process policy. * * preferred many Try a set of nodes first before normal fallback. This is * similar to preferred without the special case. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non-interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem, kernel lowmem allocations don't get policied. * Same with GFP_DMA allocations. * * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that.
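
   Aside: an illustrative userspace sketch of the policy API described
   above (not part of this file; assumes libnuma's <numaif.h> wrappers
   for the set_mempolicy(2) syscall). A process opts in to interleaving
   over nodes 0 and 1 with:

	#include <numaif.h>

	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
	if (set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask)))
		perror("set_mempolicy");	// maxnode = bits in mask

   after which subsequent anonymous allocations in this process's
   context alternate between the two nodes.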
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. A specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** * numa_nearest_node - Find nearest node by state * @node: Node id to start the search * @state: State to filter the search * * Look up the closest node by distance if @node is not in @state.
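 *
 * Example (illustrative): if node 2 is a memoryless CPU node,
 * numa_nearest_node(2, N_MEMORY) scans node_states[N_MEMORY] and
 * returns the node n with the smallest node_distance(2, n).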
* * Return: this @node if it is in state, otherwise the closest node by distance */ int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; if (state >= NR_NODE_STATES) return -EINVAL; if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; nodes_clear(pol->nodes); node_set(first_node(*nodes), pol->nodes); return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* * Default (pol==NULL) resp. local memory policies are not a * subject of any remapping. They also do not need any special * constructor. */ if (!pol || pol->mode == MPOL_LOCAL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. 
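 *
 * For instance (illustrative, mirroring the checks below):
 * mpol_new(MPOL_PREFERRED, 0, <empty mask>) quietly degenerates to
 * MPOL_LOCAL, whereas the same call with MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES set fails with -EINVAL, as does any other
 * non-local mode passed an empty nodemask.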
*/ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); mode = MPOL_LOCAL; } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) { if (!atomic_dec_and_test(&pol->refcnt)) return; kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { pol->w.cpuset_mems_allowed = *nodes; } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol || pol->mode == MPOL_LOCAL) return; if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); for_each_vma(vmi, vma) { vma_start_write(vma); mpol_rebind_policy(vma->vm_policy, new); } mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, }; static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { /* * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO * if any misplaced page is found. 
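 *
 * E.g. (illustrative): flags == MPOL_MF_STRICT alone is strictly
 * unmovable, while MPOL_MF_STRICT | MPOL_MF_MOVE is not, because the
 * caller asked for misplaced pages to be migrated rather than merely
 * reported.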
*/ return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT; } struct migration_mpol { /* for alloc_migration_target_by_mpol() */ struct mempolicy *pol; pgoff_t ilx; }; struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; struct folio *large; /* note last large folio encountered */ long nr_failed; /* could not be isolated at this time */ }; /* * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { struct folio *folio; struct queue_pages *qp = walk->private; if (unlikely(is_pmd_migration_entry(*pmd))) { qp->nr_failed++; return; } folio = pfn_folio(pmd_pfn(*pmd)); if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; return; } if (!queue_folio_required(folio, qp)) return; if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma) || !migrate_folio_add(folio, qp->pagelist, qp->flags)) qp->nr_failed++; } /* * Scan through folios, checking if they satisfy the required conditions, * moving them from LRU to local pagelist for migration if they do (or not). * * queue_folios_pte_range() has two possible return values: * 0 - continue walking to scan for more, even if an existing folio on the * wrong node could not be isolated and queued for migration. * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { queue_folios_pmd(pmd, walk); spin_unlock(ptl); goto out; } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (pte_none(ptent)) continue; if (!pte_present(ptent)) { if (is_migration_entry(pte_to_swp_entry(ptent))) qp->nr_failed++; continue; } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. */ if (folio_test_reserved(folio)) continue; if (!queue_folio_required(folio, qp)) continue; if (folio_test_large(folio)) { /* * A large folio can only be isolated from LRU once, * but may be mapped by many PTEs (and Copy-On-Write may * intersperse PTEs of other, order 0, folios). This is * a common case, so don't mistake it for failure (but * there can be other cases of multi-mapped pages which * this quick check does not help to filter out - and a * search of the pagelist might grow to be prohibitive). * * migrate_pages(&pagelist) returns nr_failed folios, so * check "large" now so that queue_pages_range() returns * a comparable nr_failed folios. 
This does imply that * if folio could not be isolated for some racy reason * at its first PTE, later PTEs will not give it another * chance of isolation; but keeps the accounting simple. */ if (folio == qp->large) continue; qp->large = folio; } if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(vma) || !migrate_folio_add(folio, qp->pagelist, flags)) { qp->nr_failed++; if (strictly_unmovable(flags)) break; } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); out: if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) { if (unlikely(is_hugetlb_entry_migration(entry))) qp->nr_failed++; goto unlock; } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || !vma_migratable(walk->vma)) { qp->nr_failed++; goto unlock; } /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) if (!isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); if (qp->nr_failed && strictly_unmovable(flags)) return -EIO; #endif return 0; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. 
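 *
 * Sketch of the mechanism (illustrative): the PTEs in the range are
 * switched to PROT_NONE-style NUMA-hinting entries, so the next access
 * faults; the hinting fault records which CPU touched the page and may
 * migrate the page toward that CPU's node before restoring access.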
*/ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mmu_gather tlb; long nr_updated; tlb_gather_mmu(&tlb, vma->vm_mm); nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA); if (nr_updated > 0) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); tlb_finish_mmu(&tlb); return nr_updated; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) endvma = end; /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. */ if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { .hugetlb_entry = queue_folios_hugetlb, .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, .walk_lock = PGWALK_WRLOCK, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are not on the required set of @nodes, * and migration is allowed, they are isolated and queued to @pagelist. * * queue_pages_range() may return: * 0 - all pages already on the right node, or successfully queued for moving * (or neither strict checking nor moving requested: only range checking). * >0 - this number of misplaced folios could not be queued for moving * (a hugetlbfs page or a transparent huge page being counted as 1). * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err ? : qp.nr_failed; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. 
*/ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; vma_assert_write_locked(vma); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Split or merge the VMA (if required) and apply the new policy */ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { unsigned long vmstart, vmend; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { *prev = vma; vmstart = start; } else { vmstart = vma->vm_start; } if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); if (IS_ERR(vma)) return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); mpol_put(new); goto out; } old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); if (pol == &default_policy) return; switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int ret; ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); if (ret > 0) { ret = page_to_nid(p); put_page(p); } return ret; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. 
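 *
 * Userspace view (an illustrative sketch, assuming libnuma's
 * <numaif.h> wrapper for get_mempolicy(2)):
 *
 *	int mode;
 *	if (get_mempolicy(&mode, NULL, 0, addr, MPOL_F_ADDR) == 0)
 *		printf("%d\n", mode);	// MPOL_DEFAULT if the VMA has none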
*/ mmap_read_lock(mm); vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, because we are about to * drop the mmap_lock, after which only "pol" remains * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, foliolist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } else { /* * A non-movable folio may reach here, and there may be * folios temporarily off the LRU or non-LRU movable * folios. Treat them as unmovable folios since they * can't be isolated, so they can't be moved at the * moment. */ return false; } } return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static long migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); long nr_failed; long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, }; nodes_clear(nmask); node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); mmap_read_lock(mm); vma = find_vma(mm, 0); /* * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, * but passes back the count of pages which could not be isolated. */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } if (err >= 0) err += nr_failed; return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of pages that could not be moved.
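 *
 * Example (illustrative): from = {0,1}, to = {2,3} migrates node 0's
 * pages to node 2 and node 1's pages to node 3; node_remap(s, *from,
 * *to) supplies that per-node mapping in the loop below.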
*/ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { long nr_failed = 0; long err = 0; nodemask_t tmp; lru_cache_disable(); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fall back to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', the scan ends * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory sourced from that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise, when we finish scanning tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s, d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we cannot preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] -> [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) nr_failed += err; if (err < 0) break; } lru_cache_enable(); if (err < 0) return err; return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* * Allocate a new folio for page migration, according to NUMA mempolicy.
*/ static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { struct migration_mpol *mmpol = (struct migration_mpol *)private; struct mempolicy *pol = mmpol->pol; pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); gfp_t gfp; order = folio_order(src); ilx += src->index >> order; if (folio_test_hugetlb(src)) { nodemask_t *nodemask; struct hstate *h; h = folio_hstate(src); gfp = htlb_alloc_mask(h); nodemask = policy_nodemask(gfp, pol, ilx, &nid); return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; else gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; page = alloc_pages_mpol(gfp, order, pol, ilx, nid); return page_rmappable_folio(page); } #else static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; /* * Lock the VMAs before scanning for pages to migrate, * to ensure we don't miss a concurrently inserted page. */ nr_failed = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); if (nr_failed < 0) { err = nr_failed; nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { err = mbind_range(&vmi, vma, &prev, start, end, new); if (err) break; } } if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } mmpol.pol = new; mmpol.ilx = 0; /* * In the interleaved case, attempt to allocate on exactly the * targeted nodes, for the first VMA to be migrated; for later * VMAs, the nodes will still be interleaved from the targeted * nodemask, but one by one may be selected differently. 
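 *
 * Illustrative: if the first misplaced folio has index 5 in an
 * order-0 mapping, mmpol.ilx is rebased below by subtracting
 * (page->index >> order); alloc_migration_target_by_mpol() later adds
 * each folio's own index back, so folios land on the interleave node
 * they would have been given at fault time.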
*/ if (new->mode == MPOL_INTERLEAVE) { struct page *page; unsigned int order; unsigned long addr = -EFAULT; list_for_each_entry(page, &pagelist, lru) { if (!PageKsm(page)) break; } if (!list_entry_is_head(page, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { addr = page_address_in_vma(page, vma); if (addr != -EFAULT) break; } } if (addr != -EFAULT) { order = compound_order(page); /* We already know the pol, but not the ilx */ mpol_cond_put(get_vma_policy(vma, addr, order, &mmpol.ilx)); /* Set base from which to increment by index */ mmpol.ilx -= page->index >> order; } } } mmap_write_unlock(mm); if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_enable(); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long nlongs = BITS_TO_LONGS(maxnode); int ret; if (in_compat_syscall()) ret = compat_get_bitmap(mask, (const compat_ulong_t __user *)nmask, maxnode); else ret = copy_from_user(mask, nmask, nlongs * sizeof(unsigned long)); if (ret) return -EFAULT; if (maxnode % BITS_PER_LONG) mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; return 0; } /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; /* * When the user specified more nodes than supported just check * if the non supported part is all zero, one word at a time, * starting at the end. */ while (maxnode > MAX_NUMNODES) { unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { maxnode -= bits; } else { maxnode = MAX_NUMNODES; t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); } if (t) return -EINVAL; } return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); bool compat = in_compat_syscall(); if (compat) nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; maxnode = nr_node_ids; } if (compat) return compat_put_bitmap((compat_ulong_t __user *)mask, nodes_addr(*nodes), maxnode); return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; if (*flags & MPOL_F_NUMA_BALANCING) { if (*mode != MPOL_BIND) return -EINVAL; *flags |= (MPOL_F_MOF | MPOL_F_MORON); } return 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; start = untagged_addr(start); err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, unsigned long, home_node, unsigned long, flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err = -ENOENT; VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) return -EINVAL; /* * flags is used for future extension if any. */ if (flags != 0) return -EINVAL; /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. */ if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; len = PAGE_ALIGN(len); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; mmap_write_lock(mm); prev = vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { /* * If any vma in the range got policy other than MPOL_BIND * or MPOL_PREFERRED_MANY we return error. We don't reset * the home node for vmas we already updated before. 
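 *
 * Usage sketch (illustrative; glibc provides no wrapper, so this
 * assumes __NR_set_mempolicy_home_node is defined and that p, len,
 * mask, maxnode and home_node are supplied by the caller):
 *
 *	mbind(p, len, MPOL_PREFERRED_MANY, &mask, maxnode, 0);
 *	syscall(__NR_set_mempolicy_home_node,
 *		(unsigned long)p, len, home_node, 0);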
*/ old = vma_policy(vma); if (!old) { prev = vma; continue; } if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) { err = -EOPNOTSUPP; break; } new = mpol_dup(old); if (IS_ERR(new)) { err = PTR_ERR(new); break; } vma_start_write(vma); new->home_node = home_node; err = mbind_range(&vmi, vma, &prev, start, end, new); mpol_put(new); if (err) break; } mmap_write_unlock(mm); return err; } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; nodemask_t nodes; int lmode = mode; int err; err = sanitize_mpol_flags(&lmode, &mode_flags); if (err) return err; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(lmode, mode_flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { *ilx = 0; return (vma->vm_ops && vma->vm_ops->get_policy) ? vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * @order: 0, or appropriate huge_page_order for interleaving * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. 
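 *
 * Typical caller pattern (illustrative):
 *
 *	pol = get_vma_policy(vma, addr, 0, &ilx);
 *	... allocate or inspect using pol and ilx ...
 *	mpol_cond_put(pol);	// drops the ref only if MPOL_F_SHARED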
*/ struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { struct mempolicy *pol; pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) { *ilx += vma->vm_pgoff >> order; *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); } return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pgoff_t ilx; /* ignored here */ pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->nodes is intersect with node_states[N_MEMORY]. * so if the following test fails, it implies * policy->nodes has movable memory only. */ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { unsigned int nid; nid = next_node_in(current->il_prev, policy->nodes); if (nid < MAX_NUMNODES) current->il_prev = nid; return nid; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (!in_task()) return node; policy = current->mempolicy; if (!policy) return node; switch (policy->mode) { case MPOL_PREFERRED: return first_node(policy->nodes); case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: case MPOL_PREFERRED_MANY: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->nodes); return z->zone ? zone_to_nid(z->zone) : node; } case MPOL_LOCAL: return node; default: BUG(); } } /* * Do static interleaving for interleave index @ilx. Returns the ilx'th * node in pol->nodes (starting from ilx=0), wrapping around if ilx * exceeds the number of present nodes. */ static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; int i; int nid; /* * The barrier will stabilize the nodemask in a register or on * the stack so that it will stop changing under the code. * * Between first_node() and next_node(), pol->nodes could be changed * by other threads. So we put pol->nodes in a local stack. */ barrier(); nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation, together with preferred node id (or the input node id). 
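 *
 * For example (illustrative): MPOL_PREFERRED rewrites *nid to the
 * preferred node and returns NULL (no filtering), while MPOL_BIND
 * normally returns &pol->nodes as the filter (subject to the zone
 * check) and only touches *nid when a home node has been set.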
*/ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid) { nodemask_t *nodemask = NULL; switch (pol->mode) { case MPOL_PREFERRED: /* Override input node id */ *nid = first_node(pol->nodes); break; case MPOL_PREFERRED_MANY: nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; break; case MPOL_BIND: /* Restrict to nodemask (but not on lower zones) */ if (apply_policy_zone(pol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&pol->nodes)) nodemask = &pol->nodes; if (pol->home_node != NUMA_NO_NODE) *nid = pol->home_node; /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(gfp & __GFP_THISNODE); break; case MPOL_INTERLEAVE: /* Override input node id */ *nid = (ilx == NO_INTERLEAVE_INDEX) ? interleave_nodes(pol) : interleave_nid(pol, ilx); break; } return nodemask; } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { pgoff_t ilx; int nid; nid = numa_node_id(); *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; break; case MPOL_LOCAL: init_nodemask_of_node(mask, numa_node_id()); break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_in_oom_domain * * If tsk's mempolicy is "bind", check for intersection between mask and * the policy nodemask. Otherwise, return true for all other policies * including "interleave", as a tsk with "interleave" policy may have * memory allocated from all nodes in system. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. 
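 *
 * E.g. (illustrative): a tsk bound to nodes {2,3} is outside the OOM
 * domain of an allocation constrained to {0,1}: killing it could not
 * relieve pressure on those nodes, so this returns false.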
*/ bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (mempolicy && mempolicy->mode == MPOL_BIND) ret = nodes_intersects(mempolicy->nodes, *mask); task_unlock(tsk); return ret; } static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; /* * This is a two pass approach. The first pass will only try the * preferred nodes but skip the direct reclaim and allow the * allocation to fail, while the second pass will try all the * nodes in system. */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); return page; } /** * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. * @order: Order of the page allocation. * @pol: Pointer to the NUMA mempolicy. * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; struct page *page; nodemask = policy_nodemask(gfp, pol, ilx, &nid); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_preferred_many(gfp, order, nid, nodemask); if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && /* filter "hugepage" allocation, unless from alloc_pages() */ order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode != MPOL_INTERLEAVE && (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_pages_node(nid, gfp | __GFP_THISNODE | __GFP_NORETRY, order); if (page || !(gfp & __GFP_DIRECT_RECLAIM)) return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ } } page = __alloc_pages(gfp, order, nid, nodemask); if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ if (static_branch_likely(&vm_numa_stat_key) && page_to_nid(page) == nid) { preempt_disable(); __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } } return page; } /** * vma_alloc_folio - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. 
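*
* A minimal sketch of a typical call site (the fault-handler context
* "vmf" is illustrative only):
*
*	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
*				vmf->address, false);
*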
Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and * excepting where direct use of alloc_pages_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; pgoff_t ilx; struct page *page; pol = get_vma_policy(vma, addr, order, &ilx); page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, numa_node_id()); mpol_cond_put(pol); return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); /** * alloc_pages - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * * Allocate 1 << @order contiguous pages. The physical address of the * first page is naturally aligned (eg an order-3 allocation will be aligned * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current * process is honoured when in process context. * * Context: Can be called from any context, providing the appropriate GFP * flags are used. * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { int nodes; unsigned long nr_pages_per_node; int delta; int i; unsigned long nr_allocated; unsigned long total_allocated = 0; nodes = nodes_weight(pol->nodes); nr_pages_per_node = nr_pages / nodes; delta = nr_pages - nodes * nr_pages_per_node; for (i = 0; i < nodes; i++) { if (delta) { nr_allocated = __alloc_pages_bulk(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { nr_allocated = __alloc_pages_bulk(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } page_array += nr_allocated; total_allocated += nr_allocated; } return total_allocated; } static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, struct mempolicy *pol, unsigned long nr_pages, struct page **page_array) { gfp_t preferred_gfp; unsigned long nr_allocated = 0; preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; } /* Bulk page allocation and the mempolicy should be considered at the * same time in some situations, such as vmalloc. * * Doing so can speed up allocation considerably, especially for * interleaved allocations.
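*
* A minimal sketch of the intended use, loosely modeled on vmalloc's
* page-array fill (variable names illustrative; the single-page loop is
* a fallback for pages the bulk call could not provide):
*
*	nr = alloc_pages_bulk_array_mempolicy(gfp, nr_pages, pages);
*	for (i = nr; i < nr_pages; i++)
*		pages[i] = alloc_page(gfp);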
*/ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; nodemask_t *nodemask; int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); if (pol->mode == MPOL_INTERLEAVE) return alloc_pages_bulk_array_interleave(gfp, pol, nr_pages, page_array); if (pol->mode == MPOL_PREFERRED_MANY) return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); nid = numa_node_id(); nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); return __alloc_pages_bulk(gfp, nid, nodemask, nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy it is copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebound by another task (the task that changes * the cpuset's mems), so we needn't do the rebind work for the current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (a->home_node != b->home_node) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in a red-black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * Look up the first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node *sp_lookup(struct shared_policy *sp, pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing.
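*
* For example, with nodes covering pgoff ranges [0,4) and [4,8) already in
* the tree, sp_lookup(sp, 5, 6) above returns the [4,8) node, while
* sp_lookup(sp, 3, 5) returns [0,4), the first node intersecting the
* queried range.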
*/ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); } /* Find shared policy intersecting idx */ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current folio node is valid in policy * * @folio: folio to be checked * @vma: vm area where folio mapped * @addr: virtual address in @vma for shared policy lookup and interleave policy * * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this * policy, or a suitable node ID to allocate a replacement folio from. */ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: polnid = interleave_nid(pol, ilx); break; case MPOL_PREFERRED: if (node_isset(curnid, pol->nodes)) goto out; polnid = first_node(pol->nodes); break; case MPOL_LOCAL: polnid = numa_node_id(); break; case MPOL_BIND: /* Optimize placement among multiple nodes via NUMA balancing */ if (pol->flags & MPOL_F_MORON) { if (node_isset(thisnid, pol->nodes)) break; goto out; } fallthrough; case MPOL_PREFERRED_MANY: /* * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->nodes)) goto out; z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->nodes); polnid = zone_to_nid(z->zone); break; default: BUG(); } /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. 
*/ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; atomic_set(&mpol_new->refcnt, 1); goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. 
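*
* A sketch of the tmpfs call site (helper names as in mm/shmem.c, shown
* here only for illustration):
*
*	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
*
* where shmem_get_sbmpol() hands over a referenced copy of the mount's
* mempolicy, or NULL for the default policy.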
*/ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct sp_node *sn; struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy to this file */ npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_npol; /* alloc node covering entire file; adds ref to file's npol */ sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); if (sn) sp_insert(sp, sn); put_npol: mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); if (pol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; if (!sp->root.rb_node) return; write_lock(&sp->lock); next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(sp, n); } write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. 
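*
* For example, with memory nodes of 64MB, 8MB and 32MB, boot-time
* interleaving covers only the 64MB and 32MB nodes; if every node were
* below 16MB, the policy would fall back to the single largest node.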
*/ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", [MPOL_PREFERRED_MANY] = "prefer (many)", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; break; case MPOL_DEFAULT: /* * Insist on an empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) { new->nodes = nodes; } else if (nodelist) { nodes_clear(new->nodes); node_set(first_node(nodes), new->nodes); } else { new->mode = MPOL_LOCAL; } /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time.
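*
* Example inputs this parser accepts (tmpfs "mpol=" mount option syntax;
* sketches, not an exhaustive list):
*
*	"interleave:0-2"  -> MPOL_INTERLEAVE over nodes 0-2
*	"bind=static:0,2" -> MPOL_BIND | MPOL_F_STATIC_NODES over nodes 0 and 2
*	"prefer:1"        -> MPOL_PREFERRED with node 1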
*/ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the * longest flag, "relative", and to display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: case MPOL_LOCAL: break; case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); }
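/*
 * mpol_to_str() examples, derived from the code above: an interleave
 * policy over nodes 0-3 with MPOL_F_STATIC_NODES formats as
 * "interleave=static:0-3"; a plain MPOL_LOCAL policy formats as "local".
 */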
// SPDX-License-Identifier: GPL-2.0+ /* * drivers/usb/class/usbtmc.c - USB Test & Measurement class driver * * Copyright (C) 2007 Stefan Kopp, Gechingen, Germany * Copyright (C) 2008 Novell, Inc. * Copyright (C) 2008 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (C) 2018 IVI Foundation, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/kref.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/mutex.h> #include <linux/usb.h> #include <linux/compat.h> #include <linux/usb/tmc.h> /* Increment API VERSION when changing tmc.h with new flags or ioctls * or when changing a significant behavior of the driver. */ #define USBTMC_API_VERSION (3) #define USBTMC_HEADER_SIZE 12 #define USBTMC_MINOR_BASE 176 /* Minimum USB timeout (in milliseconds) */ #define USBTMC_MIN_TIMEOUT 100 /* Default USB timeout (in milliseconds) */ #define USBTMC_TIMEOUT 5000 /* Max number of urbs used in write transfers */ #define MAX_URBS_IN_FLIGHT 16 /* I/O buffer size used in generic read/write functions */ #define USBTMC_BUFSIZE (4096) /* * Maximum number of read cycles to empty bulk in endpoint during CLEAR and * ABORT_BULK_IN requests. Ends the loop if (for whatever reason) a short * packet is never read. */ #define USBTMC_MAX_READS_TO_CLEAR_BULK_IN 100 static const struct usb_device_id usbtmc_devices[] = { { USB_INTERFACE_INFO(USB_CLASS_APP_SPEC, 3, 0), }, { USB_INTERFACE_INFO(USB_CLASS_APP_SPEC, 3, 1), }, { 0, } /* terminating entry */ }; MODULE_DEVICE_TABLE(usb, usbtmc_devices); /* * This structure holds the capabilities of the device. * See section 4.2.1.8 of the USBTMC specification, * and section 4.2.2 of the USBTMC usb488 subclass * specification for details. */ struct usbtmc_dev_capabilities { __u8 interface_capabilities; __u8 device_capabilities; __u8 usb488_interface_capabilities; __u8 usb488_device_capabilities; }; /* This structure holds private data for each USBTMC device. One copy is * allocated for each USBTMC device in the driver's probe function.
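*
* From user space each bound interface appears as a character device; a
* minimal SCPI exchange looks roughly like this (device path illustrative,
* error handling elided):
*
*	int fd = open("/dev/usbtmc0", O_RDWR);
*	write(fd, "*IDN?\n", 6);
*	n = read(fd, buf, sizeof(buf));
*
* read() and write() carry USBTMC DEV_DEP_MSG payloads; everything else is
* reached through the ioctls declared in <linux/usb/tmc.h>.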
*/ struct usbtmc_device_data { const struct usb_device_id *id; struct usb_device *usb_dev; struct usb_interface *intf; struct list_head file_list; unsigned int bulk_in; unsigned int bulk_out; u8 bTag; u8 bTag_last_write; /* needed for abort */ u8 bTag_last_read; /* needed for abort */ /* packet size of IN bulk */ u16 wMaxPacketSize; /* data for interrupt in endpoint handling */ u8 bNotify1; u8 bNotify2; u16 ifnum; u8 iin_bTag; u8 *iin_buffer; atomic_t iin_data_valid; unsigned int iin_ep; int iin_ep_present; int iin_interval; struct urb *iin_urb; u16 iin_wMaxPacketSize; /* coalesced usb488_caps from usbtmc_dev_capabilities */ __u8 usb488_caps; bool zombie; /* fd of disconnected device */ struct usbtmc_dev_capabilities capabilities; struct kref kref; struct mutex io_mutex; /* only one i/o function running at a time */ wait_queue_head_t waitq; struct fasync_struct *fasync; spinlock_t dev_lock; /* lock for file_list */ }; #define to_usbtmc_data(d) container_of(d, struct usbtmc_device_data, kref) /* * This structure holds private data for each USBTMC file handle. */ struct usbtmc_file_data { struct usbtmc_device_data *data; struct list_head file_elem; u32 timeout; u8 srq_byte; atomic_t srq_asserted; atomic_t closing; u8 bmTransferAttributes; /* member of DEV_DEP_MSG_IN */ u8 eom_val; u8 term_char; bool term_char_enabled; bool auto_abort; spinlock_t err_lock; /* lock for errors */ struct usb_anchor submitted; /* data for generic_write */ struct semaphore limit_write_sem; u32 out_transfer_size; int out_status; /* data for generic_read */ u32 in_transfer_size; int in_status; int in_urbs_used; struct usb_anchor in_anchor; wait_queue_head_t wait_bulk_in; }; /* Forward declarations */ static struct usb_driver usbtmc_driver; static void usbtmc_draw_down(struct usbtmc_file_data *file_data); static void usbtmc_delete(struct kref *kref) { struct usbtmc_device_data *data = to_usbtmc_data(kref); usb_put_dev(data->usb_dev); kfree(data); } static int usbtmc_open(struct inode *inode, struct file *filp) { struct usb_interface *intf; struct usbtmc_device_data *data; struct usbtmc_file_data *file_data; intf = usb_find_interface(&usbtmc_driver, iminor(inode)); if (!intf) { pr_err("can not find device for minor %d", iminor(inode)); return -ENODEV; } file_data = kzalloc(sizeof(*file_data), GFP_KERNEL); if (!file_data) return -ENOMEM; spin_lock_init(&file_data->err_lock); sema_init(&file_data->limit_write_sem, MAX_URBS_IN_FLIGHT); init_usb_anchor(&file_data->submitted); init_usb_anchor(&file_data->in_anchor); init_waitqueue_head(&file_data->wait_bulk_in); data = usb_get_intfdata(intf); /* Protect reference to data from file structure until release */ kref_get(&data->kref); mutex_lock(&data->io_mutex); file_data->data = data; atomic_set(&file_data->closing, 0); file_data->timeout = USBTMC_TIMEOUT; file_data->term_char = '\n'; file_data->term_char_enabled = 0; file_data->auto_abort = 0; file_data->eom_val = 1; INIT_LIST_HEAD(&file_data->file_elem); spin_lock_irq(&data->dev_lock); list_add_tail(&file_data->file_elem, &data->file_list); spin_unlock_irq(&data->dev_lock); mutex_unlock(&data->io_mutex); /* Store pointer in file structure's private data field */ filp->private_data = file_data; return 0; } /* * usbtmc_flush - called before file handle is closed */ static int usbtmc_flush(struct file *file, fl_owner_t id) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; file_data = file->private_data; if (file_data == NULL) return -ENODEV; atomic_set(&file_data->closing, 1); data = file_data->data; /* 
wait for io to stop */ mutex_lock(&data->io_mutex); usbtmc_draw_down(file_data); spin_lock_irq(&file_data->err_lock); file_data->in_status = 0; file_data->in_transfer_size = 0; file_data->in_urbs_used = 0; file_data->out_status = 0; file_data->out_transfer_size = 0; spin_unlock_irq(&file_data->err_lock); wake_up_interruptible_all(&data->waitq); mutex_unlock(&data->io_mutex); return 0; } static int usbtmc_release(struct inode *inode, struct file *file) { struct usbtmc_file_data *file_data = file->private_data; /* prevent IO _AND_ usbtmc_interrupt */ mutex_lock(&file_data->data->io_mutex); spin_lock_irq(&file_data->data->dev_lock); list_del(&file_data->file_elem); spin_unlock_irq(&file_data->data->dev_lock); mutex_unlock(&file_data->data->io_mutex); kref_put(&file_data->data->kref, usbtmc_delete); file_data->data = NULL; kfree(file_data); return 0; } static int usbtmc_ioctl_abort_bulk_in_tag(struct usbtmc_device_data *data, u8 tag) { u8 *buffer; struct device *dev; int rv; int n; int actual; dev = &data->intf->dev; buffer = kmalloc(USBTMC_BUFSIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INITIATE_ABORT_BULK_IN, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, tag, data->bulk_in, buffer, 2, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INITIATE_ABORT_BULK_IN returned %x with tag %02x\n", buffer[0], buffer[1]); if (buffer[0] == USBTMC_STATUS_FAILED) { /* No transfer in progress and the Bulk-OUT FIFO is empty. */ rv = 0; goto exit; } if (buffer[0] == USBTMC_STATUS_TRANSFER_NOT_IN_PROGRESS) { /* The device returns this status if either: * - There is a transfer in progress, but the specified bTag * does not match. * - There is no transfer in progress, but the Bulk-OUT FIFO * is not empty. */ rv = -ENOMSG; goto exit; } if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INITIATE_ABORT_BULK_IN returned %x\n", buffer[0]); rv = -EPERM; goto exit; } n = 0; usbtmc_abort_bulk_in_status: dev_dbg(dev, "Reading from bulk in EP\n"); /* Data must be present. So use low timeout 300 ms */ actual = 0; rv = usb_bulk_msg(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), buffer, USBTMC_BUFSIZE, &actual, 300); print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, actual, true); n++; if (rv < 0) { dev_err(dev, "usb_bulk_msg returned %d\n", rv); if (rv != -ETIMEDOUT) goto exit; } if (actual == USBTMC_BUFSIZE) goto usbtmc_abort_bulk_in_status; if (n >= USBTMC_MAX_READS_TO_CLEAR_BULK_IN) { dev_err(dev, "Couldn't clear device buffer within %d cycles\n", USBTMC_MAX_READS_TO_CLEAR_BULK_IN); rv = -EPERM; goto exit; } rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_CHECK_ABORT_BULK_IN_STATUS, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0, data->bulk_in, buffer, 0x08, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "CHECK_ABORT_BULK_IN returned %x\n", buffer[0]); if (buffer[0] == USBTMC_STATUS_SUCCESS) { rv = 0; goto exit; } if (buffer[0] != USBTMC_STATUS_PENDING) { dev_err(dev, "CHECK_ABORT_BULK_IN returned %x\n", buffer[0]); rv = -EPERM; goto exit; } if ((buffer[1] & 1) > 0) { /* The device has 1 or more queued packets the Host can read */ goto usbtmc_abort_bulk_in_status; } /* The Host must send CHECK_ABORT_BULK_IN_STATUS at a later time. 
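* In that case this ioctl fails with -EAGAIN and the caller is expected to
* retry, e.g. (userspace sketch):
*
*	while (ioctl(fd, USBTMC_IOCTL_ABORT_BULK_IN) < 0 && errno == EAGAIN)
*		usleep(1000);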
*/ rv = -EAGAIN; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_abort_bulk_in(struct usbtmc_device_data *data) { return usbtmc_ioctl_abort_bulk_in_tag(data, data->bTag_last_read); } static int usbtmc_ioctl_abort_bulk_out_tag(struct usbtmc_device_data *data, u8 tag) { struct device *dev; u8 *buffer; int rv; int n; dev = &data->intf->dev; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INITIATE_ABORT_BULK_OUT, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, tag, data->bulk_out, buffer, 2, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INITIATE_ABORT_BULK_OUT returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INITIATE_ABORT_BULK_OUT returned %x\n", buffer[0]); rv = -EPERM; goto exit; } n = 0; usbtmc_abort_bulk_out_check_status: /* do not stress device with subsequent requests */ msleep(50); rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_CHECK_ABORT_BULK_OUT_STATUS, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0, data->bulk_out, buffer, 0x08, USB_CTRL_GET_TIMEOUT); n++; if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "CHECK_ABORT_BULK_OUT returned %x\n", buffer[0]); if (buffer[0] == USBTMC_STATUS_SUCCESS) goto usbtmc_abort_bulk_out_clear_halt; if ((buffer[0] == USBTMC_STATUS_PENDING) && (n < USBTMC_MAX_READS_TO_CLEAR_BULK_IN)) goto usbtmc_abort_bulk_out_check_status; rv = -EPERM; goto exit; usbtmc_abort_bulk_out_clear_halt: rv = usb_clear_halt(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out)); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } rv = 0; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_abort_bulk_out(struct usbtmc_device_data *data) { return usbtmc_ioctl_abort_bulk_out_tag(data, data->bTag_last_write); } static int usbtmc_get_stb(struct usbtmc_file_data *file_data, __u8 *stb) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; u8 *buffer; u8 tag; int rv; dev_dbg(dev, "Enter ioctl_read_stb iin_ep_present: %d\n", data->iin_ep_present); buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return -ENOMEM; atomic_set(&data->iin_data_valid, 0); rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC488_REQUEST_READ_STATUS_BYTE, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, data->iin_bTag, data->ifnum, buffer, 0x03, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "stb usb_control_msg returned %d\n", rv); goto exit; } if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "control status returned %x\n", buffer[0]); rv = -EIO; goto exit; } if (data->iin_ep_present) { rv = wait_event_interruptible_timeout( data->waitq, atomic_read(&data->iin_data_valid) != 0, file_data->timeout); if (rv < 0) { dev_dbg(dev, "wait interrupted %d\n", rv); goto exit; } if (rv == 0) { dev_dbg(dev, "wait timed out\n"); rv = -ETIMEDOUT; goto exit; } tag = data->bNotify1 & 0x7f; if (tag != data->iin_bTag) { dev_err(dev, "expected bTag %x got %x\n", data->iin_bTag, tag); } *stb = data->bNotify2; } else { *stb = buffer[2]; } dev_dbg(dev, "stb:0x%02x received %d\n", (unsigned int)*stb, rv); exit: /* bump interrupt bTag */ data->iin_bTag += 1; if (data->iin_bTag > 127) /* 1 is for SRQ see USBTMC-USB488 subclass spec section 4.3.1 */ data->iin_bTag = 2; kfree(buffer); return rv; } static int 
usbtmc488_ioctl_read_stb(struct usbtmc_file_data *file_data, void __user *arg) { int srq_asserted = 0; __u8 stb; int rv; rv = usbtmc_get_stb(file_data, &stb); if (rv > 0) { srq_asserted = atomic_xchg(&file_data->srq_asserted, srq_asserted); if (srq_asserted) stb |= 0x40; /* Set RQS bit */ rv = put_user(stb, (__u8 __user *)arg); } return rv; } static int usbtmc_ioctl_get_srq_stb(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; int srq_asserted = 0; __u8 stb = 0; int rv; spin_lock_irq(&data->dev_lock); srq_asserted = atomic_xchg(&file_data->srq_asserted, srq_asserted); if (srq_asserted) { stb = file_data->srq_byte; spin_unlock_irq(&data->dev_lock); rv = put_user(stb, (__u8 __user *)arg); } else { spin_unlock_irq(&data->dev_lock); rv = -ENOMSG; } dev_dbg(dev, "stb:0x%02x with srq received %d\n", (unsigned int)stb, rv); return rv; } static int usbtmc488_ioctl_wait_srq(struct usbtmc_file_data *file_data, __u32 __user *arg) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; int rv; u32 timeout; unsigned long expire; if (!data->iin_ep_present) { dev_dbg(dev, "no interrupt endpoint present\n"); return -EFAULT; } if (get_user(timeout, arg)) return -EFAULT; expire = msecs_to_jiffies(timeout); mutex_unlock(&data->io_mutex); rv = wait_event_interruptible_timeout( data->waitq, atomic_read(&file_data->srq_asserted) != 0 || atomic_read(&file_data->closing), expire); mutex_lock(&data->io_mutex); /* Note! disconnect or close could be called in the meantime */ if (atomic_read(&file_data->closing) || data->zombie) rv = -ENODEV; if (rv < 0) { /* dev can be invalid now! */ pr_debug("%s - wait interrupted %d\n", __func__, rv); return rv; } if (rv == 0) { dev_dbg(dev, "%s - wait timed out\n", __func__); return -ETIMEDOUT; } dev_dbg(dev, "%s - srq asserted\n", __func__); return 0; } static int usbtmc488_ioctl_simple(struct usbtmc_device_data *data, void __user *arg, unsigned int cmd) { struct device *dev = &data->intf->dev; __u8 val; u8 *buffer; u16 wValue; int rv; if (!(data->usb488_caps & USBTMC488_CAPABILITY_SIMPLE)) return -EINVAL; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return -ENOMEM; if (cmd == USBTMC488_REQUEST_REN_CONTROL) { rv = copy_from_user(&val, arg, sizeof(val)); if (rv) { rv = -EFAULT; goto exit; } wValue = val ? 1 : 0; } else { wValue = 0; } rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), cmd, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, wValue, data->ifnum, buffer, 0x01, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "simple usb_control_msg failed %d\n", rv); goto exit; } else if (rv != 1) { dev_warn(dev, "simple usb_control_msg returned %d\n", rv); rv = -EIO; goto exit; } if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "simple control status returned %x\n", buffer[0]); rv = -EIO; goto exit; } rv = 0; exit: kfree(buffer); return rv; } /* * Sends a TRIGGER Bulk-OUT command message * See the USBTMC-USB488 specification, Table 2. * * Also updates bTag_last_write. 
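*
* Reached from user space via the USBTMC488_IOCTL_TRIGGER ioctl, e.g.:
*
*	ioctl(fd, USBTMC488_IOCTL_TRIGGER);
*
* the USB488 equivalent of an IEEE 488.1 Group Execute Trigger (GET).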
*/ static int usbtmc488_ioctl_trigger(struct usbtmc_file_data *file_data) { struct usbtmc_device_data *data = file_data->data; int retval; u8 *buffer; int actual; buffer = kzalloc(USBTMC_HEADER_SIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; buffer[0] = 128; buffer[1] = data->bTag; buffer[2] = ~data->bTag; retval = usb_bulk_msg(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), buffer, USBTMC_HEADER_SIZE, &actual, file_data->timeout); /* Store bTag (in case we need to abort) */ data->bTag_last_write = data->bTag; /* Increment bTag -- and increment again if zero */ data->bTag++; if (!data->bTag) data->bTag++; kfree(buffer); if (retval < 0) { dev_err(&data->intf->dev, "%s returned %d\n", __func__, retval); return retval; } return 0; } static struct urb *usbtmc_create_urb(void) { const size_t bufsize = USBTMC_BUFSIZE; u8 *dmabuf = NULL; struct urb *urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return NULL; dmabuf = kmalloc(bufsize, GFP_KERNEL); if (!dmabuf) { usb_free_urb(urb); return NULL; } urb->transfer_buffer = dmabuf; urb->transfer_buffer_length = bufsize; urb->transfer_flags |= URB_FREE_BUFFER; return urb; } static void usbtmc_read_bulk_cb(struct urb *urb) { struct usbtmc_file_data *file_data = urb->context; int status = urb->status; unsigned long flags; /* sync/async unlink faults aren't errors */ if (status) { if (!(/* status == -ENOENT || */ status == -ECONNRESET || status == -EREMOTEIO || /* Short packet */ status == -ESHUTDOWN)) dev_err(&file_data->data->intf->dev, "%s - nonzero read bulk status received: %d\n", __func__, status); spin_lock_irqsave(&file_data->err_lock, flags); if (!file_data->in_status) file_data->in_status = status; spin_unlock_irqrestore(&file_data->err_lock, flags); } spin_lock_irqsave(&file_data->err_lock, flags); file_data->in_transfer_size += urb->actual_length; dev_dbg(&file_data->data->intf->dev, "%s - total size: %u current: %d status: %d\n", __func__, file_data->in_transfer_size, urb->actual_length, status); spin_unlock_irqrestore(&file_data->err_lock, flags); usb_anchor_urb(urb, &file_data->in_anchor); wake_up_interruptible(&file_data->wait_bulk_in); wake_up_interruptible(&file_data->data->waitq); } static inline bool usbtmc_do_transfer(struct usbtmc_file_data *file_data) { bool data_or_error; spin_lock_irq(&file_data->err_lock); data_or_error = !usb_anchor_empty(&file_data->in_anchor) || file_data->in_status; spin_unlock_irq(&file_data->err_lock); dev_dbg(&file_data->data->intf->dev, "%s: returns %d\n", __func__, data_or_error); return data_or_error; } static ssize_t usbtmc_generic_read(struct usbtmc_file_data *file_data, void __user *user_buffer, u32 transfer_size, u32 *transferred, u32 flags) { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; u32 done = 0; u32 remaining; const u32 bufsize = USBTMC_BUFSIZE; int retval = 0; u32 max_transfer_size; unsigned long expire; int bufcount = 1; int again = 0; /* mutex already locked */ *transferred = done; max_transfer_size = transfer_size; if (flags & USBTMC_FLAG_IGNORE_TRAILER) { /* The device may send extra alignment bytes (up to * wMaxPacketSize – 1) to avoid sending a zero-length * packet */ remaining = transfer_size; if ((max_transfer_size % data->wMaxPacketSize) == 0) max_transfer_size += (data->wMaxPacketSize - 1); } else { /* round down to bufsize to avoid truncated data left */ if (max_transfer_size > bufsize) { max_transfer_size = roundup(max_transfer_size + 1 - bufsize, bufsize); } remaining = max_transfer_size; } 
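/*
 * Worked example of the sizing above, with USBTMC_BUFSIZE = 4096: a
 * request of 5000 bytes without USBTMC_FLAG_IGNORE_TRAILER is trimmed to
 * roundup(5000 + 1 - 4096, 4096) = 4096, so only whole buffers are read
 * now and the tail is left for a follow-up call. With the flag set and
 * wMaxPacketSize = 512, a request of exactly 512 bytes is widened to 1023
 * so that up to wMaxPacketSize - 1 trailing alignment bytes from the
 * device do not truncate the payload.
 */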
spin_lock_irq(&file_data->err_lock); if (file_data->in_status) { /* return the very first error */ retval = file_data->in_status; spin_unlock_irq(&file_data->err_lock); goto error; } if (flags & USBTMC_FLAG_ASYNC) { if (usb_anchor_empty(&file_data->in_anchor)) again = 1; if (file_data->in_urbs_used == 0) { file_data->in_transfer_size = 0; file_data->in_status = 0; } } else { file_data->in_transfer_size = 0; file_data->in_status = 0; } if (max_transfer_size == 0) { bufcount = 0; } else { bufcount = roundup(max_transfer_size, bufsize) / bufsize; if (bufcount > file_data->in_urbs_used) bufcount -= file_data->in_urbs_used; else bufcount = 0; if (bufcount + file_data->in_urbs_used > MAX_URBS_IN_FLIGHT) { bufcount = MAX_URBS_IN_FLIGHT - file_data->in_urbs_used; } } spin_unlock_irq(&file_data->err_lock); dev_dbg(dev, "%s: requested=%u flags=0x%X size=%u bufs=%d used=%d\n", __func__, transfer_size, flags, max_transfer_size, bufcount, file_data->in_urbs_used); while (bufcount > 0) { u8 *dmabuf = NULL; struct urb *urb = usbtmc_create_urb(); if (!urb) { retval = -ENOMEM; goto error; } dmabuf = urb->transfer_buffer; usb_fill_bulk_urb(urb, data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), dmabuf, bufsize, usbtmc_read_bulk_cb, file_data); usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); /* urb is anchored. We can release our reference. */ usb_free_urb(urb); if (unlikely(retval)) { usb_unanchor_urb(urb); goto error; } file_data->in_urbs_used++; bufcount--; } if (again) { dev_dbg(dev, "%s: ret=again\n", __func__); return -EAGAIN; } if (user_buffer == NULL) return -EINVAL; expire = msecs_to_jiffies(file_data->timeout); while (max_transfer_size > 0) { u32 this_part; struct urb *urb = NULL; if (!(flags & USBTMC_FLAG_ASYNC)) { dev_dbg(dev, "%s: before wait time %lu\n", __func__, expire); retval = wait_event_interruptible_timeout( file_data->wait_bulk_in, usbtmc_do_transfer(file_data), expire); dev_dbg(dev, "%s: wait returned %d\n", __func__, retval); if (retval <= 0) { if (retval == 0) retval = -ETIMEDOUT; goto error; } } urb = usb_get_from_anchor(&file_data->in_anchor); if (!urb) { if (!(flags & USBTMC_FLAG_ASYNC)) { /* synchronous case: must not happen */ retval = -EFAULT; goto error; } /* asynchronous case: ready, do not block or wait */ *transferred = done; dev_dbg(dev, "%s: (async) done=%u ret=0\n", __func__, done); return 0; } file_data->in_urbs_used--; if (max_transfer_size > urb->actual_length) max_transfer_size -= urb->actual_length; else max_transfer_size = 0; if (remaining > urb->actual_length) this_part = urb->actual_length; else this_part = remaining; print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, urb->transfer_buffer, urb->actual_length, true); if (copy_to_user(user_buffer + done, urb->transfer_buffer, this_part)) { usb_free_urb(urb); retval = -EFAULT; goto error; } remaining -= this_part; done += this_part; spin_lock_irq(&file_data->err_lock); if (urb->status) { /* return the very first error */ retval = file_data->in_status; spin_unlock_irq(&file_data->err_lock); usb_free_urb(urb); goto error; } spin_unlock_irq(&file_data->err_lock); if (urb->actual_length < bufsize) { /* short packet or ZLP received => ready */ usb_free_urb(urb); retval = 1; break; } if (!(flags & USBTMC_FLAG_ASYNC) && max_transfer_size > (bufsize * file_data->in_urbs_used)) { /* resubmit, since other buffers still not enough */ usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(retval)) { usb_unanchor_urb(urb); 
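/* submit failed: drop the anchor's reference, then our own, and bail out */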
usb_free_urb(urb); goto error; } file_data->in_urbs_used++; } usb_free_urb(urb); retval = 0; } error: *transferred = done; dev_dbg(dev, "%s: before kill\n", __func__); /* Attention: killing urbs can take long time (2 ms) */ usb_kill_anchored_urbs(&file_data->submitted); dev_dbg(dev, "%s: after kill\n", __func__); usb_scuttle_anchored_urbs(&file_data->in_anchor); file_data->in_urbs_used = 0; file_data->in_status = 0; /* no spinlock needed here */ dev_dbg(dev, "%s: done=%u ret=%d\n", __func__, done, retval); return retval; } static ssize_t usbtmc_ioctl_generic_read(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_message msg; ssize_t retval = 0; /* mutex already locked */ if (copy_from_user(&msg, arg, sizeof(struct usbtmc_message))) return -EFAULT; retval = usbtmc_generic_read(file_data, msg.message, msg.transfer_size, &msg.transferred, msg.flags); if (put_user(msg.transferred, &((struct usbtmc_message __user *)arg)->transferred)) return -EFAULT; return retval; } static void usbtmc_write_bulk_cb(struct urb *urb) { struct usbtmc_file_data *file_data = urb->context; int wakeup = 0; unsigned long flags; spin_lock_irqsave(&file_data->err_lock, flags); file_data->out_transfer_size += urb->actual_length; /* sync/async unlink faults aren't errors */ if (urb->status) { if (!(urb->status == -ENOENT || urb->status == -ECONNRESET || urb->status == -ESHUTDOWN)) dev_err(&file_data->data->intf->dev, "%s - nonzero write bulk status received: %d\n", __func__, urb->status); if (!file_data->out_status) { file_data->out_status = urb->status; wakeup = 1; } } spin_unlock_irqrestore(&file_data->err_lock, flags); dev_dbg(&file_data->data->intf->dev, "%s - write bulk total size: %u\n", __func__, file_data->out_transfer_size); up(&file_data->limit_write_sem); if (usb_anchor_empty(&file_data->submitted) || wakeup) wake_up_interruptible(&file_data->data->waitq); } static ssize_t usbtmc_generic_write(struct usbtmc_file_data *file_data, const void __user *user_buffer, u32 transfer_size, u32 *transferred, u32 flags) { struct usbtmc_device_data *data = file_data->data; struct device *dev; u32 done = 0; u32 remaining; unsigned long expire; const u32 bufsize = USBTMC_BUFSIZE; struct urb *urb = NULL; int retval = 0; u32 timeout; *transferred = 0; /* Get pointer to private data structure */ dev = &data->intf->dev; dev_dbg(dev, "%s: size=%u flags=0x%X sema=%u\n", __func__, transfer_size, flags, file_data->limit_write_sem.count); if (flags & USBTMC_FLAG_APPEND) { spin_lock_irq(&file_data->err_lock); retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); if (retval < 0) return retval; } else { spin_lock_irq(&file_data->err_lock); file_data->out_transfer_size = 0; file_data->out_status = 0; spin_unlock_irq(&file_data->err_lock); } remaining = transfer_size; if (remaining > INT_MAX) remaining = INT_MAX; timeout = file_data->timeout; expire = msecs_to_jiffies(timeout); while (remaining > 0) { u32 this_part, aligned; u8 *buffer = NULL; if (flags & USBTMC_FLAG_ASYNC) { if (down_trylock(&file_data->limit_write_sem)) { retval = (done)?(0):(-EAGAIN); goto exit; } } else { retval = down_timeout(&file_data->limit_write_sem, expire); if (retval < 0) { retval = -ETIMEDOUT; goto error; } } spin_lock_irq(&file_data->err_lock); retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); if (retval < 0) { up(&file_data->limit_write_sem); goto error; } /* prepare next urb to send */ urb = usbtmc_create_urb(); if (!urb) { retval = -ENOMEM; up(&file_data->limit_write_sem); goto error; } buffer = 
urb->transfer_buffer; if (remaining > bufsize) this_part = bufsize; else this_part = remaining; if (copy_from_user(buffer, user_buffer + done, this_part)) { retval = -EFAULT; up(&file_data->limit_write_sem); goto error; } print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, this_part, true); /* fill bulk with 32 bit alignment to meet USBTMC specification * (size + 3 & ~3) rounds up and simplifies user code */ aligned = (this_part + 3) & ~3; dev_dbg(dev, "write(size:%u align:%u done:%u)\n", (unsigned int)this_part, (unsigned int)aligned, (unsigned int)done); usb_fill_bulk_urb(urb, data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), urb->transfer_buffer, aligned, usbtmc_write_bulk_cb, file_data); usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(retval)) { usb_unanchor_urb(urb); up(&file_data->limit_write_sem); goto error; } usb_free_urb(urb); urb = NULL; /* urb will be finally released by usb driver */ remaining -= this_part; done += this_part; } /* All urbs are on the fly */ if (!(flags & USBTMC_FLAG_ASYNC)) { if (!usb_wait_anchor_empty_timeout(&file_data->submitted, timeout)) { retval = -ETIMEDOUT; goto error; } } retval = 0; goto exit; error: usb_kill_anchored_urbs(&file_data->submitted); exit: usb_free_urb(urb); spin_lock_irq(&file_data->err_lock); if (!(flags & USBTMC_FLAG_ASYNC)) done = file_data->out_transfer_size; if (!retval && file_data->out_status) retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); *transferred = done; dev_dbg(dev, "%s: done=%u, retval=%d, urbstat=%d\n", __func__, done, retval, file_data->out_status); return retval; } static ssize_t usbtmc_ioctl_generic_write(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_message msg; ssize_t retval = 0; /* mutex already locked */ if (copy_from_user(&msg, arg, sizeof(struct usbtmc_message))) return -EFAULT; retval = usbtmc_generic_write(file_data, msg.message, msg.transfer_size, &msg.transferred, msg.flags); if (put_user(msg.transferred, &((struct usbtmc_message __user *)arg)->transferred)) return -EFAULT; return retval; } /* * Get the generic write result */ static ssize_t usbtmc_ioctl_write_result(struct usbtmc_file_data *file_data, void __user *arg) { u32 transferred; int retval; spin_lock_irq(&file_data->err_lock); transferred = file_data->out_transfer_size; retval = file_data->out_status; spin_unlock_irq(&file_data->err_lock); if (put_user(transferred, (__u32 __user *)arg)) return -EFAULT; return retval; } /* * Sends a REQUEST_DEV_DEP_MSG_IN message on the Bulk-OUT endpoint. * @transfer_size: number of bytes to request from the device. * * See the USBTMC specification, Table 4. * * Also updates bTag_last_write. */ static int send_request_dev_dep_msg_in(struct usbtmc_file_data *file_data, u32 transfer_size) { struct usbtmc_device_data *data = file_data->data; int retval; u8 *buffer; int actual; buffer = kmalloc(USBTMC_HEADER_SIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; /* Setup IO buffer for REQUEST_DEV_DEP_MSG_IN message * Refer to class specs for details */ buffer[0] = 2; buffer[1] = data->bTag; buffer[2] = ~data->bTag; buffer[3] = 0; /* Reserved */ buffer[4] = transfer_size >> 0; buffer[5] = transfer_size >> 8; buffer[6] = transfer_size >> 16; buffer[7] = transfer_size >> 24; buffer[8] = file_data->term_char_enabled * 2; /* Use term character? 
*/ buffer[9] = file_data->term_char; buffer[10] = 0; /* Reserved */ buffer[11] = 0; /* Reserved */ /* Send bulk URB */ retval = usb_bulk_msg(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), buffer, USBTMC_HEADER_SIZE, &actual, file_data->timeout); /* Store bTag (in case we need to abort) */ data->bTag_last_write = data->bTag; /* Increment bTag -- and increment again if zero */ data->bTag++; if (!data->bTag) data->bTag++; kfree(buffer); if (retval < 0) dev_err(&data->intf->dev, "%s returned %d\n", __func__, retval); return retval; } static ssize_t usbtmc_read(struct file *filp, char __user *buf, size_t count, loff_t *f_pos) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; struct device *dev; const u32 bufsize = USBTMC_BUFSIZE; u32 n_characters; u8 *buffer; int actual; u32 done = 0; u32 remaining; int retval; /* Get pointer to private data structure */ file_data = filp->private_data; data = file_data->data; dev = &data->intf->dev; buffer = kmalloc(bufsize, GFP_KERNEL); if (!buffer) return -ENOMEM; mutex_lock(&data->io_mutex); if (data->zombie) { retval = -ENODEV; goto exit; } if (count > INT_MAX) count = INT_MAX; dev_dbg(dev, "%s(count:%zu)\n", __func__, count); retval = send_request_dev_dep_msg_in(file_data, count); if (retval < 0) { if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_out(data); goto exit; } /* Loop until we have fetched everything we requested */ remaining = count; actual = 0; /* Send bulk URB */ retval = usb_bulk_msg(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), buffer, bufsize, &actual, file_data->timeout); dev_dbg(dev, "%s: bulk_msg retval(%u), actual(%d)\n", __func__, retval, actual); /* Store bTag (in case we need to abort) */ data->bTag_last_read = data->bTag; if (retval < 0) { if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } /* Sanity checks for the header */ if (actual < USBTMC_HEADER_SIZE) { dev_err(dev, "Device sent too small first packet: %u < %u\n", actual, USBTMC_HEADER_SIZE); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } if (buffer[0] != 2) { dev_err(dev, "Device sent reply with wrong MsgID: %u != 2\n", buffer[0]); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } if (buffer[1] != data->bTag_last_write) { dev_err(dev, "Device sent reply with wrong bTag: %u != %u\n", buffer[1], data->bTag_last_write); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } /* How many characters did the instrument send? 
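         * TransferSize is little-endian in bytes 4..7 of the Bulk-IN
         * header. (Illustrative example added by the editor: a header
         * beginning 02 01 fe 00 00 10 00 00 decodes to
         * n_characters = 0x00001000 = 4096.)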
*/ n_characters = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); file_data->bmTransferAttributes = buffer[8]; dev_dbg(dev, "Bulk-IN header: N_characters(%u), bTransAttr(%u)\n", n_characters, buffer[8]); if (n_characters > remaining) { dev_err(dev, "Device wants to return more data than requested: %u > %zu\n", n_characters, count); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_in(data); goto exit; } print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, actual, true); remaining = n_characters; /* Remove the USBTMC header */ actual -= USBTMC_HEADER_SIZE; /* Remove padding if it exists */ if (actual > remaining) actual = remaining; remaining -= actual; /* Copy buffer to user space */ if (copy_to_user(buf, &buffer[USBTMC_HEADER_SIZE], actual)) { /* There must have been an addressing problem */ retval = -EFAULT; goto exit; } if ((actual + USBTMC_HEADER_SIZE) == bufsize) { retval = usbtmc_generic_read(file_data, buf + actual, remaining, &done, USBTMC_FLAG_IGNORE_TRAILER); if (retval < 0) goto exit; } done += actual; /* Update file position value */ *f_pos = *f_pos + done; retval = done; exit: mutex_unlock(&data->io_mutex); kfree(buffer); return retval; } static ssize_t usbtmc_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; struct urb *urb = NULL; ssize_t retval = 0; u8 *buffer; u32 remaining, done; u32 transfersize, aligned, buflen; file_data = filp->private_data; data = file_data->data; mutex_lock(&data->io_mutex); if (data->zombie) { retval = -ENODEV; goto exit; } done = 0; spin_lock_irq(&file_data->err_lock); file_data->out_transfer_size = 0; file_data->out_status = 0; spin_unlock_irq(&file_data->err_lock); if (!count) goto exit; if (down_trylock(&file_data->limit_write_sem)) { /* previous calls were async */ retval = -EBUSY; goto exit; } urb = usbtmc_create_urb(); if (!urb) { retval = -ENOMEM; up(&file_data->limit_write_sem); goto exit; } buffer = urb->transfer_buffer; buflen = urb->transfer_buffer_length; if (count > INT_MAX) { transfersize = INT_MAX; buffer[8] = 0; } else { transfersize = count; buffer[8] = file_data->eom_val; } /* Setup IO buffer for DEV_DEP_MSG_OUT message */ buffer[0] = 1; buffer[1] = data->bTag; buffer[2] = ~data->bTag; buffer[3] = 0; /* Reserved */ buffer[4] = transfersize >> 0; buffer[5] = transfersize >> 8; buffer[6] = transfersize >> 16; buffer[7] = transfersize >> 24; /* buffer[8] is set above... 
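         * Editor's note: bit 0 of this bmTransferAttributes byte is the
         * EOM flag from the USBTMC spec, so file_data->eom_val == 1
         * marks the last byte of this transfer as the end of the
         * message.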
*/ buffer[9] = 0; /* Reserved */ buffer[10] = 0; /* Reserved */ buffer[11] = 0; /* Reserved */ remaining = transfersize; if (transfersize + USBTMC_HEADER_SIZE > buflen) { transfersize = buflen - USBTMC_HEADER_SIZE; aligned = buflen; } else { aligned = (transfersize + (USBTMC_HEADER_SIZE + 3)) & ~3; } if (copy_from_user(&buffer[USBTMC_HEADER_SIZE], buf, transfersize)) { retval = -EFAULT; up(&file_data->limit_write_sem); goto exit; } dev_dbg(&data->intf->dev, "%s(size:%u align:%u)\n", __func__, (unsigned int)transfersize, (unsigned int)aligned); print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, aligned, true); usb_fill_bulk_urb(urb, data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out), urb->transfer_buffer, aligned, usbtmc_write_bulk_cb, file_data); usb_anchor_urb(urb, &file_data->submitted); retval = usb_submit_urb(urb, GFP_KERNEL); if (unlikely(retval)) { usb_unanchor_urb(urb); up(&file_data->limit_write_sem); goto exit; } remaining -= transfersize; data->bTag_last_write = data->bTag; data->bTag++; if (!data->bTag) data->bTag++; /* call generic_write even when remaining = 0 */ retval = usbtmc_generic_write(file_data, buf + transfersize, remaining, &done, USBTMC_FLAG_APPEND); /* truncate alignment bytes */ if (done > remaining) done = remaining; /*add size of first urb*/ done += transfersize; if (retval < 0) { usb_kill_anchored_urbs(&file_data->submitted); dev_err(&data->intf->dev, "Unable to send data, error %d\n", (int)retval); if (file_data->auto_abort) usbtmc_ioctl_abort_bulk_out(data); goto exit; } retval = done; exit: usb_free_urb(urb); mutex_unlock(&data->io_mutex); return retval; } static int usbtmc_ioctl_clear(struct usbtmc_device_data *data) { struct device *dev; u8 *buffer; int rv; int n; int actual = 0; dev = &data->intf->dev; dev_dbg(dev, "Sending INITIATE_CLEAR request\n"); buffer = kmalloc(USBTMC_BUFSIZE, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INITIATE_CLEAR, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 1, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INITIATE_CLEAR returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INITIATE_CLEAR returned %x\n", buffer[0]); rv = -EPERM; goto exit; } n = 0; usbtmc_clear_check_status: dev_dbg(dev, "Sending CHECK_CLEAR_STATUS request\n"); rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_CHECK_CLEAR_STATUS, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 2, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "CHECK_CLEAR_STATUS returned %x\n", buffer[0]); if (buffer[0] == USBTMC_STATUS_SUCCESS) goto usbtmc_clear_bulk_out_halt; if (buffer[0] != USBTMC_STATUS_PENDING) { dev_err(dev, "CHECK_CLEAR_STATUS returned %x\n", buffer[0]); rv = -EPERM; goto exit; } if ((buffer[1] & 1) != 0) { do { dev_dbg(dev, "Reading from bulk in EP\n"); actual = 0; rv = usb_bulk_msg(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in), buffer, USBTMC_BUFSIZE, &actual, USB_CTRL_GET_TIMEOUT); print_hex_dump_debug("usbtmc ", DUMP_PREFIX_NONE, 16, 1, buffer, actual, true); n++; if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } } while ((actual == USBTMC_BUFSIZE) && (n < USBTMC_MAX_READS_TO_CLEAR_BULK_IN)); } else { /* do not stress device with subsequent requests */ msleep(50); n++; } if (n >= 
USBTMC_MAX_READS_TO_CLEAR_BULK_IN) { dev_err(dev, "Couldn't clear device buffer within %d cycles\n", USBTMC_MAX_READS_TO_CLEAR_BULK_IN); rv = -EPERM; goto exit; } goto usbtmc_clear_check_status; usbtmc_clear_bulk_out_halt: rv = usb_clear_halt(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out)); if (rv < 0) { dev_err(dev, "usb_clear_halt returned %d\n", rv); goto exit; } rv = 0; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_clear_out_halt(struct usbtmc_device_data *data) { int rv; rv = usb_clear_halt(data->usb_dev, usb_sndbulkpipe(data->usb_dev, data->bulk_out)); if (rv < 0) dev_err(&data->usb_dev->dev, "%s returned %d\n", __func__, rv); return rv; } static int usbtmc_ioctl_clear_in_halt(struct usbtmc_device_data *data) { int rv; rv = usb_clear_halt(data->usb_dev, usb_rcvbulkpipe(data->usb_dev, data->bulk_in)); if (rv < 0) dev_err(&data->usb_dev->dev, "%s returned %d\n", __func__, rv); return rv; } static int usbtmc_ioctl_cancel_io(struct usbtmc_file_data *file_data) { spin_lock_irq(&file_data->err_lock); file_data->in_status = -ECANCELED; file_data->out_status = -ECANCELED; spin_unlock_irq(&file_data->err_lock); usb_kill_anchored_urbs(&file_data->submitted); return 0; } static int usbtmc_ioctl_cleanup_io(struct usbtmc_file_data *file_data) { usb_kill_anchored_urbs(&file_data->submitted); usb_scuttle_anchored_urbs(&file_data->in_anchor); spin_lock_irq(&file_data->err_lock); file_data->in_status = 0; file_data->in_transfer_size = 0; file_data->out_status = 0; file_data->out_transfer_size = 0; spin_unlock_irq(&file_data->err_lock); file_data->in_urbs_used = 0; return 0; } static int get_capabilities(struct usbtmc_device_data *data) { struct device *dev = &data->usb_dev->dev; char *buffer; int rv = 0; buffer = kmalloc(0x18, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_GET_CAPABILITIES, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 0x18, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto err_out; } dev_dbg(dev, "GET_CAPABILITIES returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "GET_CAPABILITIES returned %x\n", buffer[0]); rv = -EPERM; goto err_out; } dev_dbg(dev, "Interface capabilities are %x\n", buffer[4]); dev_dbg(dev, "Device capabilities are %x\n", buffer[5]); dev_dbg(dev, "USB488 interface capabilities are %x\n", buffer[14]); dev_dbg(dev, "USB488 device capabilities are %x\n", buffer[15]); data->capabilities.interface_capabilities = buffer[4]; data->capabilities.device_capabilities = buffer[5]; data->capabilities.usb488_interface_capabilities = buffer[14]; data->capabilities.usb488_device_capabilities = buffer[15]; data->usb488_caps = (buffer[14] & 0x07) | ((buffer[15] & 0x0f) << 4); rv = 0; err_out: kfree(buffer); return rv; } #define capability_attribute(name) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_interface *intf = to_usb_interface(dev); \ struct usbtmc_device_data *data = usb_get_intfdata(intf); \ \ return sprintf(buf, "%d\n", data->capabilities.name); \ } \ static DEVICE_ATTR_RO(name) capability_attribute(interface_capabilities); capability_attribute(device_capabilities); capability_attribute(usb488_interface_capabilities); capability_attribute(usb488_device_capabilities); static struct attribute *usbtmc_attrs[] = { &dev_attr_interface_capabilities.attr, &dev_attr_device_capabilities.attr, 
&dev_attr_usb488_interface_capabilities.attr, &dev_attr_usb488_device_capabilities.attr, NULL, }; ATTRIBUTE_GROUPS(usbtmc); static int usbtmc_ioctl_indicator_pulse(struct usbtmc_device_data *data) { struct device *dev; u8 *buffer; int rv; dev = &data->intf->dev; buffer = kmalloc(2, GFP_KERNEL); if (!buffer) return -ENOMEM; rv = usb_control_msg(data->usb_dev, usb_rcvctrlpipe(data->usb_dev, 0), USBTMC_REQUEST_INDICATOR_PULSE, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, buffer, 0x01, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "usb_control_msg returned %d\n", rv); goto exit; } dev_dbg(dev, "INDICATOR_PULSE returned %x\n", buffer[0]); if (buffer[0] != USBTMC_STATUS_SUCCESS) { dev_err(dev, "INDICATOR_PULSE returned %x\n", buffer[0]); rv = -EPERM; goto exit; } rv = 0; exit: kfree(buffer); return rv; } static int usbtmc_ioctl_request(struct usbtmc_device_data *data, void __user *arg) { struct device *dev = &data->intf->dev; struct usbtmc_ctrlrequest request; u8 *buffer = NULL; int rv; unsigned int is_in, pipe; unsigned long res; res = copy_from_user(&request, arg, sizeof(struct usbtmc_ctrlrequest)); if (res) return -EFAULT; if (request.req.wLength > USBTMC_BUFSIZE) return -EMSGSIZE; if (request.req.wLength == 0) /* Length-0 requests are never IN */ request.req.bRequestType &= ~USB_DIR_IN; is_in = request.req.bRequestType & USB_DIR_IN; if (request.req.wLength) { buffer = kmalloc(request.req.wLength, GFP_KERNEL); if (!buffer) return -ENOMEM; if (!is_in) { /* Send control data to device */ res = copy_from_user(buffer, request.data, request.req.wLength); if (res) { rv = -EFAULT; goto exit; } } } if (is_in) pipe = usb_rcvctrlpipe(data->usb_dev, 0); else pipe = usb_sndctrlpipe(data->usb_dev, 0); rv = usb_control_msg(data->usb_dev, pipe, request.req.bRequest, request.req.bRequestType, request.req.wValue, request.req.wIndex, buffer, request.req.wLength, USB_CTRL_GET_TIMEOUT); if (rv < 0) { dev_err(dev, "%s failed %d\n", __func__, rv); goto exit; } if (rv && is_in) { /* Read control data from device */ res = copy_to_user(request.data, buffer, rv); if (res) rv = -EFAULT; } exit: kfree(buffer); return rv; } /* * Get the usb timeout value */ static int usbtmc_ioctl_get_timeout(struct usbtmc_file_data *file_data, void __user *arg) { u32 timeout; timeout = file_data->timeout; return put_user(timeout, (__u32 __user *)arg); } /* * Set the usb timeout value */ static int usbtmc_ioctl_set_timeout(struct usbtmc_file_data *file_data, void __user *arg) { u32 timeout; if (get_user(timeout, (__u32 __user *)arg)) return -EFAULT; /* Note that timeout = 0 means * MAX_SCHEDULE_TIMEOUT in usb_control_msg */ if (timeout < USBTMC_MIN_TIMEOUT) return -EINVAL; file_data->timeout = timeout; return 0; } /* * enables/disables sending EOM on write */ static int usbtmc_ioctl_eom_enable(struct usbtmc_file_data *file_data, void __user *arg) { u8 eom_enable; if (copy_from_user(&eom_enable, arg, sizeof(eom_enable))) return -EFAULT; if (eom_enable > 1) return -EINVAL; file_data->eom_val = eom_enable; return 0; } /* * Configure termination character for read() */ static int usbtmc_ioctl_config_termc(struct usbtmc_file_data *file_data, void __user *arg) { struct usbtmc_termchar termc; if (copy_from_user(&termc, arg, sizeof(termc))) return -EFAULT; if ((termc.term_char_enabled > 1) || (termc.term_char_enabled && !(file_data->data->capabilities.device_capabilities & 1))) return -EINVAL; file_data->term_char = termc.term_char; file_data->term_char_enabled = termc.term_char_enabled; return 0; } static long 
usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct usbtmc_file_data *file_data; struct usbtmc_device_data *data; int retval = -EBADRQC; __u8 tmp_byte; file_data = file->private_data; data = file_data->data; mutex_lock(&data->io_mutex); if (data->zombie) { retval = -ENODEV; goto skip_io_on_zombie; } switch (cmd) { case USBTMC_IOCTL_CLEAR_OUT_HALT: retval = usbtmc_ioctl_clear_out_halt(data); break; case USBTMC_IOCTL_CLEAR_IN_HALT: retval = usbtmc_ioctl_clear_in_halt(data); break; case USBTMC_IOCTL_INDICATOR_PULSE: retval = usbtmc_ioctl_indicator_pulse(data); break; case USBTMC_IOCTL_CLEAR: retval = usbtmc_ioctl_clear(data); break; case USBTMC_IOCTL_ABORT_BULK_OUT: retval = usbtmc_ioctl_abort_bulk_out(data); break; case USBTMC_IOCTL_ABORT_BULK_IN: retval = usbtmc_ioctl_abort_bulk_in(data); break; case USBTMC_IOCTL_CTRL_REQUEST: retval = usbtmc_ioctl_request(data, (void __user *)arg); break; case USBTMC_IOCTL_GET_TIMEOUT: retval = usbtmc_ioctl_get_timeout(file_data, (void __user *)arg); break; case USBTMC_IOCTL_SET_TIMEOUT: retval = usbtmc_ioctl_set_timeout(file_data, (void __user *)arg); break; case USBTMC_IOCTL_EOM_ENABLE: retval = usbtmc_ioctl_eom_enable(file_data, (void __user *)arg); break; case USBTMC_IOCTL_CONFIG_TERMCHAR: retval = usbtmc_ioctl_config_termc(file_data, (void __user *)arg); break; case USBTMC_IOCTL_WRITE: retval = usbtmc_ioctl_generic_write(file_data, (void __user *)arg); break; case USBTMC_IOCTL_READ: retval = usbtmc_ioctl_generic_read(file_data, (void __user *)arg); break; case USBTMC_IOCTL_WRITE_RESULT: retval = usbtmc_ioctl_write_result(file_data, (void __user *)arg); break; case USBTMC_IOCTL_API_VERSION: retval = put_user(USBTMC_API_VERSION, (__u32 __user *)arg); break; case USBTMC488_IOCTL_GET_CAPS: retval = put_user(data->usb488_caps, (unsigned char __user *)arg); break; case USBTMC488_IOCTL_READ_STB: retval = usbtmc488_ioctl_read_stb(file_data, (void __user *)arg); break; case USBTMC488_IOCTL_REN_CONTROL: retval = usbtmc488_ioctl_simple(data, (void __user *)arg, USBTMC488_REQUEST_REN_CONTROL); break; case USBTMC488_IOCTL_GOTO_LOCAL: retval = usbtmc488_ioctl_simple(data, (void __user *)arg, USBTMC488_REQUEST_GOTO_LOCAL); break; case USBTMC488_IOCTL_LOCAL_LOCKOUT: retval = usbtmc488_ioctl_simple(data, (void __user *)arg, USBTMC488_REQUEST_LOCAL_LOCKOUT); break; case USBTMC488_IOCTL_TRIGGER: retval = usbtmc488_ioctl_trigger(file_data); break; case USBTMC488_IOCTL_WAIT_SRQ: retval = usbtmc488_ioctl_wait_srq(file_data, (__u32 __user *)arg); break; case USBTMC_IOCTL_MSG_IN_ATTR: retval = put_user(file_data->bmTransferAttributes, (__u8 __user *)arg); break; case USBTMC_IOCTL_AUTO_ABORT: retval = get_user(tmp_byte, (unsigned char __user *)arg); if (retval == 0) file_data->auto_abort = !!tmp_byte; break; case USBTMC_IOCTL_GET_STB: retval = usbtmc_get_stb(file_data, &tmp_byte); if (retval > 0) retval = put_user(tmp_byte, (__u8 __user *)arg); break; case USBTMC_IOCTL_GET_SRQ_STB: retval = usbtmc_ioctl_get_srq_stb(file_data, (void __user *)arg); break; case USBTMC_IOCTL_CANCEL_IO: retval = usbtmc_ioctl_cancel_io(file_data); break; case USBTMC_IOCTL_CLEANUP_IO: retval = usbtmc_ioctl_cleanup_io(file_data); break; } skip_io_on_zombie: mutex_unlock(&data->io_mutex); return retval; } static int usbtmc_fasync(int fd, struct file *file, int on) { struct usbtmc_file_data *file_data = file->private_data; return fasync_helper(fd, file, on, &file_data->data->fasync); } static __poll_t usbtmc_poll(struct file *file, poll_table *wait) { struct usbtmc_file_data 
*file_data = file->private_data; struct usbtmc_device_data *data = file_data->data; __poll_t mask; mutex_lock(&data->io_mutex); if (data->zombie) { mask = EPOLLHUP | EPOLLERR; goto no_poll; } poll_wait(file, &data->waitq, wait); /* Note that EPOLLPRI is now assigned to SRQ, and * EPOLLIN|EPOLLRDNORM to normal read data. */ mask = 0; if (atomic_read(&file_data->srq_asserted)) mask |= EPOLLPRI; /* Note that the anchor submitted includes all urbs for BULK IN * and OUT. So EPOLLOUT is signaled when BULK OUT is empty and * all BULK IN urbs are completed and moved to in_anchor. */ if (usb_anchor_empty(&file_data->submitted)) mask |= (EPOLLOUT | EPOLLWRNORM); if (!usb_anchor_empty(&file_data->in_anchor)) mask |= (EPOLLIN | EPOLLRDNORM); spin_lock_irq(&file_data->err_lock); if (file_data->in_status || file_data->out_status) mask |= EPOLLERR; spin_unlock_irq(&file_data->err_lock); dev_dbg(&data->intf->dev, "poll mask = %x\n", mask); no_poll: mutex_unlock(&data->io_mutex); return mask; } static const struct file_operations fops = { .owner = THIS_MODULE, .read = usbtmc_read, .write = usbtmc_write, .open = usbtmc_open, .release = usbtmc_release, .flush = usbtmc_flush, .unlocked_ioctl = usbtmc_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = usbtmc_fasync, .poll = usbtmc_poll, .llseek = default_llseek, }; static struct usb_class_driver usbtmc_class = { .name = "usbtmc%d", .fops = &fops, .minor_base = USBTMC_MINOR_BASE, }; static void usbtmc_interrupt(struct urb *urb) { struct usbtmc_device_data *data = urb->context; struct device *dev = &data->intf->dev; int status = urb->status; int rv; dev_dbg(&data->intf->dev, "int status: %d len %d\n", status, urb->actual_length); switch (status) { case 0: /* SUCCESS */ /* check for valid STB notification */ if (data->iin_buffer[0] > 0x81) { data->bNotify1 = data->iin_buffer[0]; data->bNotify2 = data->iin_buffer[1]; atomic_set(&data->iin_data_valid, 1); wake_up_interruptible(&data->waitq); goto exit; } /* check for SRQ notification */ if (data->iin_buffer[0] == 0x81) { unsigned long flags; struct list_head *elem; if (data->fasync) kill_fasync(&data->fasync, SIGIO, POLL_PRI); spin_lock_irqsave(&data->dev_lock, flags); list_for_each(elem, &data->file_list) { struct usbtmc_file_data *file_data; file_data = list_entry(elem, struct usbtmc_file_data, file_elem); file_data->srq_byte = data->iin_buffer[1]; atomic_set(&file_data->srq_asserted, 1); } spin_unlock_irqrestore(&data->dev_lock, flags); dev_dbg(dev, "srq received bTag %x stb %x\n", (unsigned int)data->iin_buffer[0], (unsigned int)data->iin_buffer[1]); wake_up_interruptible_all(&data->waitq); goto exit; } dev_warn(dev, "invalid notification: %x\n", data->iin_buffer[0]); break; case -EOVERFLOW: dev_err(dev, "overflow with length %d, actual length is %d\n", data->iin_wMaxPacketSize, urb->actual_length); fallthrough; default: /* urb terminated, clean up */ dev_dbg(dev, "urb terminated, status: %d\n", status); return; } exit: rv = usb_submit_urb(urb, GFP_ATOMIC); if (rv) dev_err(dev, "usb_submit_urb failed: %d\n", rv); } static void usbtmc_free_int(struct usbtmc_device_data *data) { if (!data->iin_ep_present || !data->iin_urb) return; usb_kill_urb(data->iin_urb); kfree(data->iin_buffer); data->iin_buffer = NULL; usb_free_urb(data->iin_urb); data->iin_urb = NULL; kref_put(&data->kref, usbtmc_delete); } static int usbtmc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usbtmc_device_data *data; struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *bulk_in, *bulk_out, *int_in; 
int retcode; dev_dbg(&intf->dev, "%s called\n", __func__); data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->intf = intf; data->id = id; data->usb_dev = usb_get_dev(interface_to_usbdev(intf)); usb_set_intfdata(intf, data); kref_init(&data->kref); mutex_init(&data->io_mutex); init_waitqueue_head(&data->waitq); atomic_set(&data->iin_data_valid, 0); INIT_LIST_HEAD(&data->file_list); spin_lock_init(&data->dev_lock); data->zombie = 0; /* Initialize USBTMC bTag and other fields */ data->bTag = 1; /* 2 <= bTag <= 127 USBTMC-USB488 subclass specification 4.3.1 */ data->iin_bTag = 2; /* USBTMC devices have only one setting, so use that */ iface_desc = data->intf->cur_altsetting; data->ifnum = iface_desc->desc.bInterfaceNumber; /* Find bulk endpoints */ retcode = usb_find_common_endpoints(iface_desc, &bulk_in, &bulk_out, NULL, NULL); if (retcode) { dev_err(&intf->dev, "bulk endpoints not found\n"); goto err_put; } retcode = -EINVAL; data->bulk_in = bulk_in->bEndpointAddress; data->wMaxPacketSize = usb_endpoint_maxp(bulk_in); if (!data->wMaxPacketSize) goto err_put; dev_dbg(&intf->dev, "Found bulk in endpoint at %u\n", data->bulk_in); data->bulk_out = bulk_out->bEndpointAddress; dev_dbg(&intf->dev, "Found Bulk out endpoint at %u\n", data->bulk_out); /* Find int endpoint */ retcode = usb_find_int_in_endpoint(iface_desc, &int_in); if (!retcode) { data->iin_ep_present = 1; data->iin_ep = int_in->bEndpointAddress; data->iin_wMaxPacketSize = usb_endpoint_maxp(int_in); data->iin_interval = int_in->bInterval; dev_dbg(&intf->dev, "Found Int in endpoint at %u\n", data->iin_ep); } retcode = get_capabilities(data); if (retcode) dev_err(&intf->dev, "can't read capabilities\n"); if (data->iin_ep_present) { /* allocate int urb */ data->iin_urb = usb_alloc_urb(0, GFP_KERNEL); if (!data->iin_urb) { retcode = -ENOMEM; goto error_register; } /* Protect interrupt in endpoint data until iin_urb is freed */ kref_get(&data->kref); /* allocate buffer for interrupt in */ data->iin_buffer = kmalloc(data->iin_wMaxPacketSize, GFP_KERNEL); if (!data->iin_buffer) { retcode = -ENOMEM; goto error_register; } /* fill interrupt urb */ usb_fill_int_urb(data->iin_urb, data->usb_dev, usb_rcvintpipe(data->usb_dev, data->iin_ep), data->iin_buffer, data->iin_wMaxPacketSize, usbtmc_interrupt, data, data->iin_interval); retcode = usb_submit_urb(data->iin_urb, GFP_KERNEL); if (retcode) { dev_err(&intf->dev, "Failed to submit iin_urb\n"); goto error_register; } } retcode = usb_register_dev(intf, &usbtmc_class); if (retcode) { dev_err(&intf->dev, "Not able to get a minor (base %u, slice default): %d\n", USBTMC_MINOR_BASE, retcode); goto error_register; } dev_dbg(&intf->dev, "Using minor number %d\n", intf->minor); return 0; error_register: usbtmc_free_int(data); err_put: kref_put(&data->kref, usbtmc_delete); return retcode; } static void usbtmc_disconnect(struct usb_interface *intf) { struct usbtmc_device_data *data = usb_get_intfdata(intf); struct list_head *elem; usb_deregister_dev(intf, &usbtmc_class); mutex_lock(&data->io_mutex); data->zombie = 1; wake_up_interruptible_all(&data->waitq); list_for_each(elem, &data->file_list) { struct usbtmc_file_data *file_data; file_data = list_entry(elem, struct usbtmc_file_data, file_elem); usb_kill_anchored_urbs(&file_data->submitted); usb_scuttle_anchored_urbs(&file_data->in_anchor); } mutex_unlock(&data->io_mutex); usbtmc_free_int(data); kref_put(&data->kref, usbtmc_delete); } static void usbtmc_draw_down(struct usbtmc_file_data *file_data) { int time; time = 
usb_wait_anchor_empty_timeout(&file_data->submitted, 1000);
        if (!time)
                usb_kill_anchored_urbs(&file_data->submitted);
        usb_scuttle_anchored_urbs(&file_data->in_anchor);
}

static int usbtmc_suspend(struct usb_interface *intf, pm_message_t message)
{
        struct usbtmc_device_data *data = usb_get_intfdata(intf);
        struct list_head *elem;

        if (!data)
                return 0;

        mutex_lock(&data->io_mutex);
        list_for_each(elem, &data->file_list) {
                struct usbtmc_file_data *file_data;

                file_data = list_entry(elem, struct usbtmc_file_data, file_elem);
                usbtmc_draw_down(file_data);
        }

        if (data->iin_ep_present && data->iin_urb)
                usb_kill_urb(data->iin_urb);

        mutex_unlock(&data->io_mutex);
        return 0;
}

static int usbtmc_resume(struct usb_interface *intf)
{
        struct usbtmc_device_data *data = usb_get_intfdata(intf);
        int retcode = 0;

        if (data->iin_ep_present && data->iin_urb)
                retcode = usb_submit_urb(data->iin_urb, GFP_KERNEL);
        if (retcode)
                dev_err(&intf->dev, "Failed to submit iin_urb\n");

        return retcode;
}

static int usbtmc_pre_reset(struct usb_interface *intf)
{
        struct usbtmc_device_data *data = usb_get_intfdata(intf);
        struct list_head *elem;

        if (!data)
                return 0;

        mutex_lock(&data->io_mutex);

        list_for_each(elem, &data->file_list) {
                struct usbtmc_file_data *file_data;

                file_data = list_entry(elem, struct usbtmc_file_data, file_elem);
                usbtmc_ioctl_cancel_io(file_data);
        }

        return 0;
}

static int usbtmc_post_reset(struct usb_interface *intf)
{
        struct usbtmc_device_data *data = usb_get_intfdata(intf);

        mutex_unlock(&data->io_mutex);

        return 0;
}

static struct usb_driver usbtmc_driver = {
        .name = "usbtmc",
        .id_table = usbtmc_devices,
        .probe = usbtmc_probe,
        .disconnect = usbtmc_disconnect,
        .suspend = usbtmc_suspend,
        .resume = usbtmc_resume,
        .pre_reset = usbtmc_pre_reset,
        .post_reset = usbtmc_post_reset,
        .dev_groups = usbtmc_groups,
};

module_usb_driver(usbtmc_driver);

MODULE_LICENSE("GPL");
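/*
 * Editor's note (illustrative, not part of the driver): a minimal
 * userspace sketch of how the usbtmc char device above is typically
 * exercised. The device node name and the SCPI "*IDN?" query are
 * assumptions; open()/write()/read() and USBTMC_IOCTL_SET_TIMEOUT are
 * taken from the file operations and usbtmc_ioctl() dispatch shown
 * above.
 */
#if 0 /* userspace example, not kernel code */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/usbtmc.h>

int main(void)
{
        char buf[256];
        __u32 timeout = 2000;                   /* ms, see usbtmc_ioctl_set_timeout() */
        int fd = open("/dev/usbtmc0", O_RDWR);  /* assumed device node */
        ssize_t n;

        if (fd < 0)
                return 1;
        ioctl(fd, USBTMC_IOCTL_SET_TIMEOUT, &timeout);
        write(fd, "*IDN?\n", 6);                /* becomes a DEV_DEP_MSG_OUT transfer */
        n = read(fd, buf, sizeof(buf) - 1);     /* REQUEST_DEV_DEP_MSG_IN + Bulk-IN read */
        if (n > 0) {
                buf[n] = '\0';
                printf("%s", buf);
        }
        close(fd);
        return 0;
}
#endif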
// SPDX-License-Identifier: GPL-2.0-only
/*
 * HID driver for the apple ir device
 *
 * Original driver written by James McKenzie
 * Ported to recent 2.6 kernel versions by Greg Kroah-Hartman <gregkh@suse.de>
 * Updated to support newer remotes by Bastien Nocera <hadess@hadess.net>
 * Ported to HID subsystem by Benjamin Tissoires <benjamin.tissoires@gmail.com>
 *
 * Copyright (C) 2006 James McKenzie
 * Copyright (C) 2008 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2008 Novell Inc.
 * Copyright (C) 2010, 2012 Bastien Nocera <hadess@hadess.net>
 * Copyright (C) 2013 Benjamin Tissoires <benjamin.tissoires@gmail.com>
 * Copyright (C) 2013 Red Hat Inc.
 * All Rights Reserved
 */

#include <linux/device.h>
#include <linux/hid.h>
#include <linux/module.h>
#include "hid-ids.h"

MODULE_AUTHOR("James McKenzie");
MODULE_AUTHOR("Benjamin Tissoires <benjamin.tissoires@redhat.com>");
MODULE_DESCRIPTION("HID Apple IR remote controls");
MODULE_LICENSE("GPL");

#define KEY_MASK                0x0F
#define TWO_PACKETS_MASK        0x40

/*
 * James McKenzie has two devices both of which report the following
 * 25 87 ee 83 0a  +
 * 25 87 ee 83 0c  -
 * 25 87 ee 83 09  <<
 * 25 87 ee 83 06  >>
 * 25 87 ee 83 05  >"
 * 25 87 ee 83 03  menu
 * 26 00 00 00 00  for key repeat
 */

/*
 * Thomas Glanzmann reports the following responses
 * 25 87 ee ca 0b  +
 * 25 87 ee ca 0d  -
 * 25 87 ee ca 08  <<
 * 25 87 ee ca 07  >>
 * 25 87 ee ca 04  >"
 * 25 87 ee ca 02  menu
 * 26 00 00 00 00  for key repeat
 *
 * He also observes the following event sometimes
 * sent after a key is released, which I interpret
 * as a flat battery message
 * 25 87 e0 ca 06  flat battery
 */

/*
 * Alexandre Karpenko reports the following responses for Device ID 0x8242
 * 25 87 ee 47 0b  +
 * 25 87 ee 47 0d  -
 * 25 87 ee 47 08  <<
 * 25 87 ee 47 07  >>
 * 25 87 ee 47 04  >"
 * 25 87 ee 47 02  menu
 * 26 87 ee 47 **  for key repeat (** is the code of the key being held)
 */

/*
 * Bastien Nocera's remote
 * 25 87 ee 91 5f  followed by
 * 25 87 ee 91 05  gives you >"
 *
 * 25 87 ee 91 5c  followed by
 * 25 87 ee 91 05  gives you the middle button
 */

/*
 * Fabien Andre's remote
 * 25 87 ee a3 5e  followed by
 * 25 87 ee a3 04  gives you >"
 *
 * 25 87 ee a3 5d  followed by
 * 25 87 ee a3 04  gives you the middle button
 */

static const unsigned short appleir_key_table[] = {
        KEY_RESERVED,
        KEY_MENU,
        KEY_PLAYPAUSE,
        KEY_FORWARD,
        KEY_BACK,
        KEY_VOLUMEUP,
        KEY_VOLUMEDOWN,
        KEY_RESERVED,
        KEY_RESERVED,
        KEY_RESERVED,
        KEY_RESERVED,
        KEY_RESERVED,
        KEY_RESERVED,
        KEY_RESERVED,
        KEY_ENTER,
        KEY_PLAYPAUSE,
        KEY_RESERVED,
};

struct appleir {
        struct input_dev *input_dev;
        struct hid_device *hid;
        unsigned short keymap[ARRAY_SIZE(appleir_key_table)];
        struct timer_list key_up_timer; /* timer for key up */
        spinlock_t lock;                /* protects .current_key */
        int current_key;                /* the currently pressed key */
        int prev_key_idx;               /* key index in a 2-packet message */
};

static int get_key(int data)
{
        /*
         * The key is coded across bits 2..9:
         *
         * 0x00 or 0x01 (        )  key:  0 -> KEY_RESERVED
         * 0x02 or 0x03 (  menu  )  key:  1 -> KEY_MENU
         * 0x04 or 0x05 (   >"   )  key:  2 -> KEY_PLAYPAUSE
         * 0x06 or 0x07 (   >>   )  key:  3 -> KEY_FORWARD
         * 0x08 or 0x09 (   <<   )  key:  4 -> KEY_BACK
         * 0x0a or 0x0b (    +   )  key:  5 -> KEY_VOLUMEUP
         * 0x0c or 0x0d (    -   )  key:  6 -> KEY_VOLUMEDOWN
         * 0x0e or 0x0f (        )  key:  7 -> KEY_RESERVED
         * 0x50 or 0x51 (        )  key:  8 -> KEY_RESERVED
         * 0x52 or 0x53 (        )  key:  9 -> KEY_RESERVED
         * 0x54 or 0x55 (        )  key: 10 -> KEY_RESERVED
         * 0x56 or 0x57 (        )  key: 11 -> KEY_RESERVED
         * 0x58 or 0x59 (        )  key: 12 -> KEY_RESERVED
         * 0x5a or 0x5b (        )  key: 13 -> KEY_RESERVED
         * 0x5c or 0x5d ( middle )  key: 14 -> KEY_ENTER
         * 0x5e or 0x5f (   >"   )  key: 15 -> KEY_PLAYPAUSE
         *
         * Packets starting with 0x5 are part of a two-packet message,
         * we notify the caller by sending a negative value.
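         *
         * (Illustrative example added by the editor, derived from the
         * tables above: data = 0x0a gives (0x0a >> 1) & KEY_MASK = 5,
         * i.e. KEY_VOLUMEUP; data = 0x5c has TWO_PACKETS_MASK set and
         * returns -14, so raw_event() stores index 14 (KEY_ENTER in
         * the keymap) and applies it when the second packet arrives.)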
*/ int key = (data >> 1) & KEY_MASK; if ((data & TWO_PACKETS_MASK)) /* Part of a 2 packets-command */ key = -key; return key; } static void key_up(struct hid_device *hid, struct appleir *appleir, int key) { input_report_key(appleir->input_dev, key, 0); input_sync(appleir->input_dev); } static void key_down(struct hid_device *hid, struct appleir *appleir, int key) { input_report_key(appleir->input_dev, key, 1); input_sync(appleir->input_dev); } static void battery_flat(struct appleir *appleir) { dev_err(&appleir->input_dev->dev, "possible flat battery?\n"); } static void key_up_tick(struct timer_list *t) { struct appleir *appleir = from_timer(appleir, t, key_up_timer); struct hid_device *hid = appleir->hid; unsigned long flags; spin_lock_irqsave(&appleir->lock, flags); if (appleir->current_key) { key_up(hid, appleir, appleir->current_key); appleir->current_key = 0; } spin_unlock_irqrestore(&appleir->lock, flags); } static int appleir_raw_event(struct hid_device *hid, struct hid_report *report, u8 *data, int len) { struct appleir *appleir = hid_get_drvdata(hid); static const u8 keydown[] = { 0x25, 0x87, 0xee }; static const u8 keyrepeat[] = { 0x26, }; static const u8 flatbattery[] = { 0x25, 0x87, 0xe0 }; unsigned long flags; if (len != 5) goto out; if (!memcmp(data, keydown, sizeof(keydown))) { int index; spin_lock_irqsave(&appleir->lock, flags); /* * If we already have a key down, take it up before marking * this one down */ if (appleir->current_key) key_up(hid, appleir, appleir->current_key); /* Handle dual packet commands */ if (appleir->prev_key_idx > 0) index = appleir->prev_key_idx; else index = get_key(data[4]); if (index >= 0) { appleir->current_key = appleir->keymap[index]; key_down(hid, appleir, appleir->current_key); /* * Remote doesn't do key up, either pull them up, in * the test above, or here set a timer which pulls * them up after 1/8 s */ mod_timer(&appleir->key_up_timer, jiffies + HZ / 8); appleir->prev_key_idx = 0; } else /* Remember key for next packet */ appleir->prev_key_idx = -index; spin_unlock_irqrestore(&appleir->lock, flags); goto out; } appleir->prev_key_idx = 0; if (!memcmp(data, keyrepeat, sizeof(keyrepeat))) { key_down(hid, appleir, appleir->current_key); /* * Remote doesn't do key up, either pull them up, in the test * above, or here set a timer which pulls them up after 1/8 s */ mod_timer(&appleir->key_up_timer, jiffies + HZ / 8); goto out; } if (!memcmp(data, flatbattery, sizeof(flatbattery))) { battery_flat(appleir); /* Fall through */ } out: /* let hidraw and hiddev handle the report */ return 0; } static int appleir_input_configured(struct hid_device *hid, struct hid_input *hidinput) { struct input_dev *input_dev = hidinput->input; struct appleir *appleir = hid_get_drvdata(hid); int i; appleir->input_dev = input_dev; input_dev->keycode = appleir->keymap; input_dev->keycodesize = sizeof(unsigned short); input_dev->keycodemax = ARRAY_SIZE(appleir->keymap); input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); memcpy(appleir->keymap, appleir_key_table, sizeof(appleir->keymap)); for (i = 0; i < ARRAY_SIZE(appleir_key_table); i++) set_bit(appleir->keymap[i], input_dev->keybit); clear_bit(KEY_RESERVED, input_dev->keybit); return 0; } static int appleir_input_mapping(struct hid_device *hid, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { return -1; } static int appleir_probe(struct hid_device *hid, const struct hid_device_id *id) { int ret; struct appleir *appleir; appleir = devm_kzalloc(&hid->dev, 
sizeof(struct appleir), GFP_KERNEL);
        if (!appleir)
                return -ENOMEM;

        appleir->hid = hid;

        /* force input as some remotes bypass the input registration */
        hid->quirks |= HID_QUIRK_HIDINPUT_FORCE;

        spin_lock_init(&appleir->lock);
        timer_setup(&appleir->key_up_timer, key_up_tick, 0);

        hid_set_drvdata(hid, appleir);

        ret = hid_parse(hid);
        if (ret) {
                hid_err(hid, "parse failed\n");
                goto fail;
        }

        ret = hid_hw_start(hid, HID_CONNECT_DEFAULT | HID_CONNECT_HIDDEV_FORCE);
        if (ret) {
                hid_err(hid, "hw start failed\n");
                goto fail;
        }

        return 0;
fail:
        devm_kfree(&hid->dev, appleir);
        return ret;
}

static void appleir_remove(struct hid_device *hid)
{
        struct appleir *appleir = hid_get_drvdata(hid);

        hid_hw_stop(hid);
        del_timer_sync(&appleir->key_up_timer);
}

static const struct hid_device_id appleir_devices[] = {
        { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL) },
        { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL2) },
        { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL3) },
        { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL4) },
        { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL5) },
        { }
};
MODULE_DEVICE_TABLE(hid, appleir_devices);

static struct hid_driver appleir_driver = {
        .name = "appleir",
        .id_table = appleir_devices,
        .raw_event = appleir_raw_event,
        .input_configured = appleir_input_configured,
        .probe = appleir_probe,
        .remove = appleir_remove,
        .input_mapping = appleir_input_mapping,
};
module_hid_driver(appleir_driver);
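/*
 * Editor's note (illustrative, not part of the driver): the remote ends
 * up as a regular input device, so its synthesized key-down/key-up
 * events can be observed from userspace through the standard evdev
 * interface. The device node below is an assumption; the rest is the
 * stock struct input_event ABI.
 */
#if 0 /* userspace example, not kernel code */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/input.h>

int main(void)
{
        struct input_event ev;
        int fd = open("/dev/input/event5", O_RDONLY);   /* assumed node */

        if (fd < 0)
                return 1;
        while (read(fd, &ev, sizeof(ev)) == sizeof(ev)) {
                /* value 1 = key_down(), value 0 = key_up_tick() timer */
                if (ev.type == EV_KEY)
                        printf("key %u value %d\n", ev.code, ev.value);
        }
        close(fd);
        return 0;
}
#endif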
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Roccat driver for Linux
 *
 * Copyright (c) 2010 Stefan Achatz <erazor_de@users.sourceforge.net>
 */

/*
 */

/*
 * Module roccat is a char device used to report special events of roccat
 * hardware to userland. These events include requests for on-screen-display of
 * profile or dpi settings or requests for execution of macro sequences that are
 * not stored in device. The information in these events depends on hid device
 * implementation and contains data that is not available in a single hid event
 * or else hidraw could have been used.
 * It is inspired by hidraw, but uses only one circular buffer for all readers.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cdev.h> #include <linux/poll.h> #include <linux/sched/signal.h> #include <linux/hid-roccat.h> #include <linux/module.h> #define ROCCAT_FIRST_MINOR 0 #define ROCCAT_MAX_DEVICES 8 /* should be a power of 2 for performance reason */ #define ROCCAT_CBUF_SIZE 16 struct roccat_report { uint8_t *value; }; struct roccat_device { unsigned int minor; int report_size; int open; int exist; wait_queue_head_t wait; struct device *dev; struct hid_device *hid; struct list_head readers; /* protects modifications of readers list */ struct mutex readers_lock; /* * circular_buffer has one writer and multiple readers with their own * read pointers */ struct roccat_report cbuf[ROCCAT_CBUF_SIZE]; int cbuf_end; struct mutex cbuf_lock; }; struct roccat_reader { struct list_head node; struct roccat_device *device; int cbuf_start; }; static int roccat_major; static struct cdev roccat_cdev; static struct roccat_device *devices[ROCCAT_MAX_DEVICES]; /* protects modifications of devices array */ static DEFINE_MUTEX(devices_lock); static ssize_t roccat_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct roccat_reader *reader = file->private_data; struct roccat_device *device = reader->device; struct roccat_report *report; ssize_t retval = 0, len; DECLARE_WAITQUEUE(wait, current); mutex_lock(&device->cbuf_lock); /* no data? */ if (reader->cbuf_start == device->cbuf_end) { add_wait_queue(&device->wait, &wait); set_current_state(TASK_INTERRUPTIBLE); /* wait for data */ while (reader->cbuf_start == device->cbuf_end) { if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; break; } if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (!device->exist) { retval = -EIO; break; } mutex_unlock(&device->cbuf_lock); schedule(); mutex_lock(&device->cbuf_lock); set_current_state(TASK_INTERRUPTIBLE); } set_current_state(TASK_RUNNING); remove_wait_queue(&device->wait, &wait); } /* here we either have data or a reason to return if retval is set */ if (retval) goto exit_unlock; report = &device->cbuf[reader->cbuf_start]; /* * If report is larger than requested amount of data, rest of report * is lost! */ len = device->report_size > count ? 
count : device->report_size; if (copy_to_user(buffer, report->value, len)) { retval = -EFAULT; goto exit_unlock; } retval += len; reader->cbuf_start = (reader->cbuf_start + 1) % ROCCAT_CBUF_SIZE; exit_unlock: mutex_unlock(&device->cbuf_lock); return retval; } static __poll_t roccat_poll(struct file *file, poll_table *wait) { struct roccat_reader *reader = file->private_data; poll_wait(file, &reader->device->wait, wait); if (reader->cbuf_start != reader->device->cbuf_end) return EPOLLIN | EPOLLRDNORM; if (!reader->device->exist) return EPOLLERR | EPOLLHUP; return 0; } static int roccat_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct roccat_reader *reader; struct roccat_device *device; int error = 0; reader = kzalloc(sizeof(struct roccat_reader), GFP_KERNEL); if (!reader) return -ENOMEM; mutex_lock(&devices_lock); device = devices[minor]; if (!device) { pr_emerg("roccat device with minor %d doesn't exist\n", minor); error = -ENODEV; goto exit_err_devices; } mutex_lock(&device->readers_lock); if (!device->open++) { /* power on device on adding first reader */ error = hid_hw_power(device->hid, PM_HINT_FULLON); if (error < 0) { --device->open; goto exit_err_readers; } error = hid_hw_open(device->hid); if (error < 0) { hid_hw_power(device->hid, PM_HINT_NORMAL); --device->open; goto exit_err_readers; } } reader->device = device; /* new reader doesn't get old events */ reader->cbuf_start = device->cbuf_end; list_add_tail(&reader->node, &device->readers); file->private_data = reader; exit_err_readers: mutex_unlock(&device->readers_lock); exit_err_devices: mutex_unlock(&devices_lock); if (error) kfree(reader); return error; } static int roccat_release(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct roccat_reader *reader = file->private_data; struct roccat_device *device; mutex_lock(&devices_lock); device = devices[minor]; if (!device) { mutex_unlock(&devices_lock); pr_emerg("roccat device with minor %d doesn't exist\n", minor); return -ENODEV; } mutex_lock(&device->readers_lock); list_del(&reader->node); mutex_unlock(&device->readers_lock); kfree(reader); if (!--device->open) { /* removing last reader */ if (device->exist) { hid_hw_power(device->hid, PM_HINT_NORMAL); hid_hw_close(device->hid); } else { kfree(device); } } mutex_unlock(&devices_lock); return 0; } /* * roccat_report_event() - output data to readers * @minor: minor device number returned by roccat_connect() * @data: pointer to data * * Return value is zero on success, a negative error code on failure. * * This is called from interrupt handler. */ int roccat_report_event(int minor, u8 const *data) { struct roccat_device *device; struct roccat_reader *reader; struct roccat_report *report; uint8_t *new_value; device = devices[minor]; new_value = kmemdup(data, device->report_size, GFP_ATOMIC); if (!new_value) return -ENOMEM; mutex_lock(&device->cbuf_lock); report = &device->cbuf[device->cbuf_end]; /* passing NULL is safe */ kfree(report->value); report->value = new_value; device->cbuf_end = (device->cbuf_end + 1) % ROCCAT_CBUF_SIZE; list_for_each_entry(reader, &device->readers, node) { /* * As we already inserted one element, the buffer can't be * empty. If start and end are equal, buffer is full and we * increase start, so that slow reader misses one event, but * gets the newer ones in the right order. 
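         *
         * (Illustrative example added by the editor: with
         * ROCCAT_CBUF_SIZE = 16, a reader lagging 15 unread events
         * behind the writer before this insert ends up with
         * cbuf_start == cbuf_end after it; advancing cbuf_start
         * discards only the oldest event, so the reader keeps the 15
         * newest in order.)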
*/ if (reader->cbuf_start == device->cbuf_end) reader->cbuf_start = (reader->cbuf_start + 1) % ROCCAT_CBUF_SIZE; } mutex_unlock(&device->cbuf_lock); wake_up_interruptible(&device->wait); return 0; } EXPORT_SYMBOL_GPL(roccat_report_event); /* * roccat_connect() - create a char device for special event output * @class: the class thats used to create the device. Meant to hold device * specific sysfs attributes. * @hid: the hid device the char device should be connected to. * @report_size: size of reports * * Return value is minor device number in Range [0, ROCCAT_MAX_DEVICES] on * success, a negative error code on failure. */ int roccat_connect(const struct class *klass, struct hid_device *hid, int report_size) { unsigned int minor; struct roccat_device *device; int temp; device = kzalloc(sizeof(struct roccat_device), GFP_KERNEL); if (!device) return -ENOMEM; mutex_lock(&devices_lock); for (minor = 0; minor < ROCCAT_MAX_DEVICES; ++minor) { if (devices[minor]) continue; break; } if (minor < ROCCAT_MAX_DEVICES) { devices[minor] = device; } else { mutex_unlock(&devices_lock); kfree(device); return -EINVAL; } device->dev = device_create(klass, &hid->dev, MKDEV(roccat_major, minor), NULL, "%s%s%d", "roccat", hid->driver->name, minor); if (IS_ERR(device->dev)) { devices[minor] = NULL; mutex_unlock(&devices_lock); temp = PTR_ERR(device->dev); kfree(device); return temp; } mutex_unlock(&devices_lock); init_waitqueue_head(&device->wait); INIT_LIST_HEAD(&device->readers); mutex_init(&device->readers_lock); mutex_init(&device->cbuf_lock); device->minor = minor; device->hid = hid; device->exist = 1; device->cbuf_end = 0; device->report_size = report_size; return minor; } EXPORT_SYMBOL_GPL(roccat_connect); /* roccat_disconnect() - remove char device from hid device * @minor: the minor device number returned by roccat_connect() */ void roccat_disconnect(int minor) { struct roccat_device *device; mutex_lock(&devices_lock); device = devices[minor]; mutex_unlock(&devices_lock); device->exist = 0; /* TODO exist maybe not needed */ device_destroy(device->dev->class, MKDEV(roccat_major, minor)); mutex_lock(&devices_lock); devices[minor] = NULL; mutex_unlock(&devices_lock); if (device->open) { hid_hw_close(device->hid); wake_up_interruptible(&device->wait); } else { kfree(device); } } EXPORT_SYMBOL_GPL(roccat_disconnect); static long roccat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); struct roccat_device *device; unsigned int minor = iminor(inode); long retval = 0; mutex_lock(&devices_lock); device = devices[minor]; if (!device) { retval = -ENODEV; goto out; } switch (cmd) { case ROCCATIOCGREPSIZE: if (put_user(device->report_size, (int __user *)arg)) retval = -EFAULT; break; default: retval = -ENOTTY; } out: mutex_unlock(&devices_lock); return retval; } static const struct file_operations roccat_ops = { .owner = THIS_MODULE, .read = roccat_read, .poll = roccat_poll, .open = roccat_open, .release = roccat_release, .llseek = noop_llseek, .unlocked_ioctl = roccat_ioctl, }; static int __init roccat_init(void) { int retval; dev_t dev_id; retval = alloc_chrdev_region(&dev_id, ROCCAT_FIRST_MINOR, ROCCAT_MAX_DEVICES, "roccat"); if (retval < 0) { pr_warn("can't get major number\n"); goto error; } roccat_major = MAJOR(dev_id); cdev_init(&roccat_cdev, &roccat_ops); retval = cdev_add(&roccat_cdev, dev_id, ROCCAT_MAX_DEVICES); if (retval < 0) { pr_warn("cannot add cdev\n"); goto cleanup_alloc_chrdev_region; } return 0; cleanup_alloc_chrdev_region: 
        unregister_chrdev_region(dev_id, ROCCAT_MAX_DEVICES);
error:
        return retval;
}

static void __exit roccat_exit(void)
{
        dev_t dev_id = MKDEV(roccat_major, 0);

        cdev_del(&roccat_cdev);
        unregister_chrdev_region(dev_id, ROCCAT_MAX_DEVICES);
}

module_init(roccat_init);
module_exit(roccat_exit);

MODULE_AUTHOR("Stefan Achatz");
MODULE_DESCRIPTION("USB Roccat char device");
MODULE_LICENSE("GPL v2");
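/*
 * Editor's note (illustrative, not part of the driver): a minimal
 * userspace sketch of consuming roccat events. The node name depends on
 * the bound HID driver and minor (device_create() above uses
 * "roccat<driver-name><minor>"), so the path here is an assumption;
 * ROCCATIOCGREPSIZE comes from <linux/hid-roccat.h> as included by the
 * driver itself.
 */
#if 0 /* userspace example, not kernel code */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/hid-roccat.h>

int main(void)
{
        int repsize;
        unsigned char *buf;
        int fd = open("/dev/roccatkone0", O_RDONLY);    /* assumed node */

        if (fd < 0 || ioctl(fd, ROCCATIOCGREPSIZE, &repsize) < 0)
                return 1;
        buf = malloc(repsize);
        if (!buf)
                return 1;
        /* each read() returns at most one report of repsize bytes */
        while (read(fd, buf, repsize) == repsize)
                printf("event: first byte %02x\n", buf[0]);
        free(buf);
        close(fd);
        return 0;
}
#endif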
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
 *
 * The code this is based on carried the following copyright notice:
 * ---
 * (C) Copyright 2001-2006
 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com
 * Re-worked by Ben Greear <greearb@candelatech.com>
 * ---
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/rculist.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/net_tstamp.h>
#include <linux/ethtool.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/if_link.h>
#include <linux/if_macvlan.h>
#include <linux/hash.h>
#include <linux/workqueue.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <linux/netpoll.h>
#include <linux/phy.h>

#define MACVLAN_HASH_BITS	8
#define MACVLAN_HASH_SIZE	(1<<MACVLAN_HASH_BITS)
#define MACVLAN_DEFAULT_BC_QUEUE_LEN	1000

#define MACVLAN_F_PASSTHRU	1
#define MACVLAN_F_ADDRCHANGE	2

struct macvlan_port {
	struct net_device	*dev;
	struct hlist_head	vlan_hash[MACVLAN_HASH_SIZE];
	struct list_head	vlans;
	struct sk_buff_head	bc_queue;
	struct work_struct	bc_work;
	u32			bc_queue_len_used;
	int			bc_cutoff;
	u32			flags;
	int			count;
	struct hlist_head	vlan_source_hash[MACVLAN_HASH_SIZE];
	DECLARE_BITMAP(bc_filter, MACVLAN_MC_FILTER_SZ);
	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
	unsigned char		perm_addr[ETH_ALEN];
};

struct macvlan_source_entry {
	struct hlist_node	hlist;
	struct macvlan_dev	*vlan;
	unsigned char		addr[6+2] __aligned(sizeof(u16));
	struct rcu_head		rcu;
};

struct macvlan_skb_cb {
	const struct macvlan_dev *src;
};

#define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0]))

static void macvlan_port_destroy(struct net_device *dev);
static void update_port_bc_queue_len(struct macvlan_port *port);

static inline bool macvlan_passthru(const struct macvlan_port *port)
{
	return port->flags & MACVLAN_F_PASSTHRU;
}

static inline void macvlan_set_passthru(struct macvlan_port *port)
{
	port->flags |= MACVLAN_F_PASSTHRU;
}

static inline bool macvlan_addr_change(const struct macvlan_port *port)
{
	return port->flags & MACVLAN_F_ADDRCHANGE;
}

static inline void macvlan_set_addr_change(struct
macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(entry->addr, addr) && entry->vlan == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); entry->vlan = vlan; h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ eth_hw_addr_set(vlan->dev, addr); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. 
*/ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct sk_buff *skb, const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { struct net_device *dev = vlan->dev; if (local) return __dev_forward_skb(dev, skb); skb->dev = dev; if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; return 0; } static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) { return (u32)(((unsigned long)vlan) >> L1_CACHE_SHIFT); } static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { u32 val = __get_unaligned_cpu32(addr + 2); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); } static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port, struct net_device *src, enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct sk_buff *nskb; unsigned int i; int err; unsigned int hash; if (skb->protocol == htons(ETH_P_PAUSE)) return; hash_for_each_rcu(port->vlan_hash, i, vlan, hlist) { if (vlan->dev == src || !(vlan->mode & mode)) continue; hash = mc_hash(vlan, eth->h_dest); if (!test_bit(hash, vlan->mc_filter)) continue; err = NET_RX_DROP; nskb = skb_clone(skb, GFP_ATOMIC); if (likely(nskb)) err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx(nskb); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, true); } } static void macvlan_multicast_rx(const struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { if (!src) /* frame comes from an external address */ macvlan_broadcast(skb, port, NULL, MACVLAN_MODE_PRIVATE | MACVLAN_MODE_VEPA | MACVLAN_MODE_PASSTHRU| MACVLAN_MODE_BRIDGE); else if (src->mode == MACVLAN_MODE_VEPA) /* flood to everyone except source */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE); else /* * flood only to VEPA ports, bridge ports * already saw the frame on the way out. 
*/ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); } static void macvlan_process_broadcast(struct work_struct *w) { struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work); struct sk_buff *skb; struct sk_buff_head list; __skb_queue_head_init(&list); spin_lock_bh(&port->bc_queue.lock); skb_queue_splice_tail_init(&port->bc_queue, &list); spin_unlock_bh(&port->bc_queue.lock); while ((skb = __skb_dequeue(&list))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; rcu_read_lock(); macvlan_multicast_rx(port, src, skb); rcu_read_unlock(); if (src) dev_put(src->dev); consume_skb(skb); cond_resched(); } } static void macvlan_broadcast_enqueue(struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { struct sk_buff *nskb; int err = -ENOMEM; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) goto err; MACVLAN_SKB_CB(nskb)->src = src; spin_lock(&port->bc_queue.lock); if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) { if (src) dev_hold(src->dev); __skb_queue_tail(&port->bc_queue, nskb); err = 0; } spin_unlock(&port->bc_queue.lock); queue_work(system_unbound_wq, &port->bc_work); if (err) goto free_nskb; return; free_nskb: kfree_skb(nskb); err: dev_core_stats_rx_dropped_inc(skb->dev); } static void macvlan_flush_sources(struct macvlan_port *port, struct macvlan_dev *vlan) { struct macvlan_source_entry *entry; struct hlist_node *next; int i; hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist) if (entry->vlan == vlan) macvlan_hash_del_source(entry); vlan->macaddr_count = 0; } static void macvlan_forward_source_one(struct sk_buff *skb, struct macvlan_dev *vlan) { struct sk_buff *nskb; struct net_device *dev; int len; int ret; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; len = nskb->len + ETH_HLEN; nskb->dev = dev; if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; ret = __netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr)) { if (entry->vlan->flags & MACVLAN_FLAG_NODST) consume = true; macvlan_forward_source_one(skb, entry->vlan); } } return consume; } /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len = 0; int ret; rx_handler_result_t handle_res; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { /* forward to 
original port. */ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: __netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } hash = mc_hash(NULL, eth->h_dest); if (test_bit(hash, port->bc_filter)) macvlan_broadcast_enqueue(port, src, skb); else if (test_bit(hash, port->mc_filter)) macvlan_multicast_rx(port, src, skb); return RX_HANDLER_PASS; } if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else vlan = macvlan_hash_lookup(port, eth->h_dest); if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE) return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { ret = NET_RX_DROP; handle_res = RX_HANDLER_CONSUMED; goto out; } *pskb = skb; skb->dev = dev; skb->pkt_type = PACKET_HOST; ret = NET_RX_SUCCESS; handle_res = RX_HANDLER_ANOTHER; out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); return handle_res; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } } xmit_world: skb->dev = vlan->lowerdev; return dev_queue_xmit_accel(skb, netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; if (unlikely(netpoll_tx_running(dev))) return macvlan_netpoll_send_skb(vlan, skb); ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(vlan->pcpu_stats->tx_dropped); } return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return dev_hard_header(skb, lowerdev, type, daddr, saddr ? 
: dev->dev_addr, len); } static const struct header_ops macvlan_hard_header_ops = { .create = macvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static int macvlan_open(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; int err; if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto out; } goto hash_add; } err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; /* Attempt to populate accel_priv which is used to offload the L2 * forwarding requests for unicast packets. */ if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) vlan->accel_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); /* If earlier attempt to offload failed, or accel_priv is not * populated we must add the unicast address to the lower device. */ if (IS_ERR_OR_NULL(vlan->accel_priv)) { vlan->accel_priv = NULL; err = dev_uc_add(lowerdev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(lowerdev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto clear_multi; } hash_add: macvlan_hash_add(vlan); return 0; clear_multi: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); del_unicast: if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } else { dev_uc_del(lowerdev, dev->dev_addr); } out: return err; } static int macvlan_stop(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; } if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(lowerdev, -1); dev_uc_del(lowerdev, dev->dev_addr); hash_del: macvlan_hash_del(vlan, !dev->dismantle); return 0; } static int macvlan_sync_address(struct net_device *dev, const unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { /* Just copy in the new address */ eth_hw_addr_set(dev, addr); } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; dev_uc_del(lowerdev, dev->dev_addr); } macvlan_hash_change_addr(vlan, addr); } if (macvlan_passthru(port) && !macvlan_addr_change(port)) { /* Since addr_change isn't set, we are here due to lower * device change. Save the lower-dev address so we can * restore it later. 
*/ ether_addr_copy(vlan->port->perm_addr, lowerdev->dev_addr); } macvlan_clear_addr_change(port); return 0; } static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); struct sockaddr *addr = p; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ if (ether_addr_equal(dev->dev_addr, addr->sa_data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->sa_data)) return -EADDRINUSE; return macvlan_sync_address(dev, addr->sa_data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void macvlan_compute_filter(unsigned long *mc_filter, struct net_device *dev, struct macvlan_dev *vlan, int cutoff) { if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ); } else { DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ); struct netdev_hw_addr *ha; bitmap_zero(filter, MACVLAN_MC_FILTER_SZ); netdev_for_each_mc_addr(ha, dev) { if (!vlan && ha->synced <= cutoff) continue; __set_bit(mc_hash(vlan, ha->addr), filter); } __set_bit(mc_hash(vlan, dev->broadcast), filter); bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ); } } static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan) { if (vlan->port->bc_cutoff < 0) { bitmap_zero(vlan->port->bc_filter, MACVLAN_MC_FILTER_SZ); return; } macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL, vlan->port->bc_cutoff); } static void macvlan_set_mac_lists(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvlan_compute_filter(vlan->mc_filter, dev, vlan, 0); dev_uc_sync(vlan->lowerdev, dev); dev_mc_sync(vlan->lowerdev, dev); /* This is slightly inaccurate as we're including the subscription * list of vlan->lowerdev too. * * Bug alert: This only works if everyone has the same broadcast * address as lowerdev. As soon as someone changes theirs this * will break. * * However, this is already broken as when you change your broadcast * address we don't get called. * * The solution is to maintain a list of broadcast addresses like * we do for uc/mc, if you care. 
*/ macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL, 0); macvlan_recompute_bc_filter(vlan); } static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff) { if (vlan->port->bc_cutoff == cutoff) return; vlan->port->bc_cutoff = cutoff; macvlan_recompute_bc_filter(vlan); } static int macvlan_change_mtu(struct net_device *dev, int new_mtu) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->lowerdev->mtu < new_mtu) return -EINVAL; dev->mtu = new_mtu; return 0; } static int macvlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return generic_hwtstamp_get_lower(real_dev, cfg); } static int macvlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = macvlan_dev_real_dev(dev); if (!net_eq(dev_net(dev), &init_net)) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } /* * macvlan network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) #define ALWAYS_ON_FEATURES (ALWAYS_ON_OFFLOADS | NETIF_F_LLTX) #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; dev->features |= ALWAYS_ON_FEATURES; dev->hw_features |= NETIF_F_LRO; dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; port->count += 1; /* Get macvlan's reference to lowerdev */ netdev_hold(lowerdev, &vlan->dev_tracker, GFP_KERNEL); return 0; } static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); } static void macvlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->pcpu_stats) { struct vlan_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_errors = 0, tx_dropped = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { start = 
u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* rx_errors & tx_dropped are u32, updated * without syncp protection. */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->rx_dropped = rx_errors; stats->tx_dropped = tx_dropped; } } static int macvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return vlan_vid_add(lowerdev, proto, vid); } static int macvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; vlan_vid_del(lowerdev, proto, vid); return 0; } static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); return err; } static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. 
*/ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver)); strscpy(drvinfo->version, "0.1", sizeof(drvinfo->version)); } static int macvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); return __ethtool_get_link_ksettings(vlan->lowerdev, cmd); } static int macvlan_ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return ethtool_get_ts_info_by_layer(real_dev, info); } static netdev_features_t macvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct macvlan_dev *vlan = netdev_priv(dev); netdev_features_t lowerdev_features = vlan->lowerdev->features; netdev_features_t mask; features |= NETIF_F_ALL_FOR_ALL; features &= (vlan->set_features | ~MACVLAN_FEATURES); mask = features; lowerdev_features &= (features | ~NETIF_F_LRO); features = netdev_increment_features(lowerdev_features, features, mask); features |= ALWAYS_ON_FEATURES; features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES); return features; } #ifdef CONFIG_NET_POLL_CONTROLLER static void macvlan_dev_poll_controller(struct net_device *dev) { return; } static int macvlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; struct netpoll *netpoll; int err; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void macvlan_dev_netpoll_cleanup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return vlan->lowerdev->ifindex; } static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = macvlan_ethtool_get_link_ksettings, .get_drvinfo = macvlan_ethtool_get_drvinfo, .get_ts_info = macvlan_ethtool_get_ts_info, }; static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, .ndo_get_stats64 = macvlan_dev_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid, .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = macvlan_dev_poll_controller, .ndo_netpoll_setup = macvlan_dev_netpoll_setup, .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, #endif .ndo_get_iflink = 
macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, .ndo_hwtstamp_get = macvlan_hwtstamp_get, .ndo_hwtstamp_set = macvlan_hwtstamp_set, }; static void macvlan_dev_free(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); /* Get rid of the macvlan's reference to lowerdev */ netdev_put(vlan->lowerdev, &vlan->dev_tracker); } void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT | IFF_CHANGE_PROTO_DOWN; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macvlan_dev_free; dev->header_ops = &macvlan_hard_header_ops; dev->ethtool_ops = &macvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(macvlan_common_setup); static void macvlan_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc(sizeof(*port), GFP_KERNEL); if (port == NULL) return -ENOMEM; port->dev = dev; ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_source_hash[i]); port->bc_queue_len_used = 0; port->bc_cutoff = 1; skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); if (err) kfree(port); else dev->priv_flags |= IFF_MACVLAN_PORT; return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); /* After this point, no packet can schedule bc_work anymore, * but we need to cancel it and purge left skbs if any. */ cancel_work_sync(&port->bc_work); while ((skb = __skb_dequeue(&port->bc_queue))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; if (src) dev_put(src->dev); kfree_skb(skb); } /* If the lower device address has been changed by passthru * macvlan, put it back. 
*/ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { struct sockaddr sa; sa.sa_family = port->dev->type; memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len); dev_set_mac_address(port->dev, &sa, NULL); } kfree(port); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *nla, *head; int rem, len; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: case MACVLAN_MODE_SOURCE: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: case MACVLAN_MACADDR_FLUSH: case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) return -EADDRNOTAVAIL; } if (data[IFLA_MACVLAN_MACADDR_DATA]) { head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { if (nla_type(nla) != IFLA_MACVLAN_MACADDR || nla_len(nla) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(nla))) return -EADDRNOTAVAIL; } } if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; } /* * reconfigure list of remote source mac address * (only for macvlan devices in source mode) * Note regarding alignment: all netlink data is aligned to 4 Byte, which * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. 
*/ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, struct nlattr *data[]) { char *addr = NULL; int ret, rem, len; struct nlattr *nla, *head; struct macvlan_source_entry *entry; if (data[IFLA_MACVLAN_MACADDR]) addr = nla_data(data[IFLA_MACVLAN_MACADDR]); if (mode == MACVLAN_MACADDR_ADD) { if (!addr) return -EINVAL; return macvlan_hash_add_source(vlan, addr); } else if (mode == MACVLAN_MACADDR_DEL) { if (!addr) return -EINVAL; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) { macvlan_hash_del_source(entry); vlan->macaddr_count--; } } else if (mode == MACVLAN_MACADDR_FLUSH) { macvlan_flush_sources(vlan->port, vlan); } else if (mode == MACVLAN_MACADDR_SET) { macvlan_flush_sources(vlan->port, vlan); if (addr) { ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } if (!data[IFLA_MACVLAN_MACADDR_DATA]) return 0; head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { addr = nla_data(nla); ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } } else { return -EINVAL; } return 0; } int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port; struct net_device *lowerdev; int err; int macmode; bool create = false; if (!tb[IFLA_LINK]) return -EINVAL; lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; /* When creating macvlans or macvtaps on top of other macvlans - use * the real device as the lowerdev. */ if (netif_is_macvlan(lowerdev)) lowerdev = macvlan_dev_real_dev(lowerdev); if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu; else if (dev->mtu > lowerdev->mtu) return -EINVAL; /* MTU range: 68 - lowerdev->max_mtu */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = lowerdev->max_mtu; if (!tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (!netif_is_macvlan_port(lowerdev)) { err = macvlan_port_create(lowerdev); if (err < 0) return err; create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. 
*/ err = -EINVAL; goto destroy_macvlan_port; } vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (data && data[IFLA_MACVLAN_FLAGS]) vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { if (port->count) { err = -EINVAL; goto destroy_macvlan_port; } macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) { err = -EINVAL; goto destroy_macvlan_port; } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) goto destroy_macvlan_port; } vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN; if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); err = register_netdevice(dev); if (err < 0) goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; list_add_tail_rcu(&vlan->list, &port->vlans); update_port_bc_queue_len(vlan->port); netif_stacked_transfer_operstate(lowerdev, dev); linkwatch_fire_event(dev); return 0; unregister_netdev: /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); return err; destroy_macvlan_port: /* the macvlan port may be freed by macvlan_uninit when fail to register. * so we destroy the macvlan port only when it's valid. */ if (create && macvlan_port_get_rtnl(lowerdev)) { macvlan_flush_sources(port, vlan); macvlan_port_destroy(port->dev); } return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); static int macvlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return macvlan_common_newlink(src_net, dev, tb, data, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->mode == MACVLAN_MODE_SOURCE) macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); update_port_bc_queue_len(vlan->port); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; enum macvlan_macaddr_mode macmode; int ret; /* Validate mode, but don't set yet: setting flags may fail. 
*/ if (data && data[IFLA_MACVLAN_MODE]) { set_mode = true; mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); /* Passthrough mode can't be set or cleared dynamically */ if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; if (vlan->mode == MACVLAN_MODE_SOURCE && vlan->mode != mode) macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) err = dev_set_promiscuity(vlan->lowerdev, -1); else err = dev_set_promiscuity(vlan->lowerdev, 1); if (err < 0) return err; } vlan->flags = flags; } if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) { vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); update_port_bc_queue_len(vlan->port); } if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); if (set_mode) vlan->mode = mode; if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) return -EINVAL; macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); ret = macvlan_changelink_sources(vlan, macmode, data); if (ret) return ret; } return 0; } static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) { if (vlan->macaddr_count == 0) return 0; return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); } static size_t macvlan_get_size(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */ ); } static int macvlan_fill_info_macaddr(struct sk_buff *skb, const struct macvlan_dev *vlan, const int i) { struct hlist_head *h = &vlan->port->vlan_source_hash[i]; struct macvlan_source_entry *entry; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (entry->vlan != vlan) continue; if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) return 1; } return 0; } static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; int i; struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) goto nla_put_failure; if (vlan->macaddr_count > 0) { nest = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA); if (nest == NULL) goto nla_put_failure; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { if (macvlan_fill_info_macaddr(skb, vlan, i)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used)) goto nla_put_failure; if (port->bc_cutoff != 1 && nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, 
[IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT }, [IFLA_MACVLAN_BC_CUTOFF] = { .type = NLA_S32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) { /* common fields */ ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; ops->changelink = macvlan_changelink; ops->get_size = macvlan_get_size; ops->fill_info = macvlan_fill_info; return rtnl_link_register(ops); }; EXPORT_SYMBOL_GPL(macvlan_link_register); static struct net *macvlan_get_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, .get_link_net = macvlan_get_link_net, .priv_size = sizeof(struct macvlan_dev), }; static void update_port_bc_queue_len(struct macvlan_port *port) { u32 max_bc_queue_len_req = 0; struct macvlan_dev *vlan; list_for_each_entry(vlan, &port->vlans, list) { if (vlan->bc_queue_len_req > max_bc_queue_len_req) max_bc_queue_len_req = vlan->bc_queue_len_req; } port->bc_queue_len_used = max_bc_queue_len_req; } static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); if (!netif_is_macvlan_port(dev)) return NOTIFY_DONE; port = macvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) netif_stacked_transfer_operstate(vlan->lowerdev, vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(vlan, &port->vlans, list) { if (vlan->dev->mtu <= dev->mtu) continue; dev_set_mtu(vlan->dev, dev->mtu); } break; case NETDEV_CHANGEADDR: if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, struct macvlan_dev, list); if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr)) return NOTIFY_BAD; break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. 
		 */
		return NOTIFY_BAD;

	case NETDEV_NOTIFY_PEERS:
	case NETDEV_BONDING_FAILOVER:
	case NETDEV_RESEND_IGMP:
		/* Propagate to all vlans */
		list_for_each_entry(vlan, &port->vlans, list)
			call_netdevice_notifiers(event, vlan->dev);
	}
	return NOTIFY_DONE;
}

static struct notifier_block macvlan_notifier_block __read_mostly = {
	.notifier_call	= macvlan_device_event,
};

static int __init macvlan_init_module(void)
{
	int err;

	register_netdevice_notifier(&macvlan_notifier_block);

	err = macvlan_link_register(&macvlan_link_ops);
	if (err < 0)
		goto err1;
	return 0;
err1:
	unregister_netdevice_notifier(&macvlan_notifier_block);
	return err;
}

static void __exit macvlan_cleanup_module(void)
{
	rtnl_link_unregister(&macvlan_link_ops);
	unregister_netdevice_notifier(&macvlan_notifier_block);
}

module_init(macvlan_init_module);
module_exit(macvlan_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Driver for MAC address based VLANs");
MODULE_ALIAS_RTNL_LINK("macvlan");
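/*
 * Illustrative standalone sketch (not the driver's code): macvlan_eth_hash()
 * above buckets a 6-byte Ethernet address by loading a full 64-bit word,
 * shifting away the two bytes that lie beyond the address, and taking the
 * top MACVLAN_HASH_BITS bits of a golden-ratio multiply. The constant below
 * is assumed to mirror GOLDEN_RATIO_64 from <linux/hash.h>, and the 8-byte
 * buffer stands in for the Ethernet-header bytes that make the kernel's
 * unaligned over-read safe in situ -- treat both as assumptions.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MACVLAN_HASH_BITS 8
#define GOLDEN_RATIO_64 0x61c8864680b583ebull	/* assumed kernel value */

static uint32_t hash_64(uint64_t val, unsigned int bits)
{
	/* userspace re-creation of the kernel's generic hash_64() */
	return (uint32_t)((val * GOLDEN_RATIO_64) >> (64 - bits));
}

static uint32_t eth_hash(const unsigned char *addr)
{
	uint64_t value;

	memcpy(&value, addr, sizeof(value));	/* 8-byte load of a 6-byte MAC */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	value >>= 16;	/* drop the two trailing non-address bytes */
#else
	value <<= 16;
#endif
	return hash_64(value, MACVLAN_HASH_BITS);
}

int main(void)
{
	/* MAC in the first six bytes; two pad bytes keep the load in bounds */
	unsigned char addr[8] = { 0x02, 0x00, 0x5e, 0x10, 0x20, 0x30 };

	printf("bucket = %u of %u\n", eth_hash(addr), 1u << MACVLAN_HASH_BITS);
	return 0;
}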
/*
 *  include/linux/ktime.h
 *
 *  ktime_t - nanosecond-resolution time format.
 *
 *  Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
 *
 *  data type definitions, declarations, prototypes and macros.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *
 *	Roman Zippel provided the ideas and primary code snippets of
 *	the ktime_t union and further simplifications of the original
 *	code.
 *
 *  For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_KTIME_H
#define _LINUX_KTIME_H

#include <asm/bug.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/types.h>

/**
 * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
 * @secs:	seconds to set
 * @nsecs:	nanoseconds to set
 *
 * Return: The ktime_t representation of the value.
 */
static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs)
{
	if (unlikely(secs >= KTIME_SEC_MAX))
		return KTIME_MAX;

	return secs * NSEC_PER_SEC + (s64)nsecs;
}

/* Subtract two ktime_t variables. rem = lhs - rhs: */
#define ktime_sub(lhs, rhs)	((lhs) - (rhs))

/* Add two ktime_t variables. res = lhs + rhs: */
#define ktime_add(lhs, rhs)	((lhs) + (rhs))

/*
 * Same as ktime_add(), but avoids undefined behaviour on overflow; however,
 * this means that you must check the result for overflow yourself.
 */
#define ktime_add_unsafe(lhs, rhs)	((u64) (lhs) + (rhs))

/*
 * Add a ktime_t variable and a scalar nanosecond value.
 * res = kt + nsval:
 */
#define ktime_add_ns(kt, nsval)		((kt) + (nsval))

/*
 * Subtract a scalar nanosecond value from a ktime_t variable
 * res = kt - nsval:
 */
#define ktime_sub_ns(kt, nsval)		((kt) - (nsval))

/* convert a timespec64 to ktime_t format: */
static inline ktime_t timespec64_to_ktime(struct timespec64 ts)
{
	return ktime_set(ts.tv_sec, ts.tv_nsec);
}

/* Map the ktime_t to timespec conversion to ns_to_timespec function */
#define ktime_to_timespec64(kt)		ns_to_timespec64((kt))

/* Convert ktime_t to nanoseconds */
static inline s64 ktime_to_ns(const ktime_t kt)
{
	return kt;
}

/**
 * ktime_compare - Compares two ktime_t variables for less, greater or equal
 * @cmp1:	comparable1
 * @cmp2:	comparable2
 *
 * Return: ...
 *   cmp1  < cmp2: return <0
 *   cmp1 == cmp2: return 0
 *   cmp1  > cmp2: return >0
 */
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
	if (cmp1 < cmp2)
		return -1;
	if (cmp1 > cmp2)
		return 1;
	return 0;
}

/**
 * ktime_after - Compare if a ktime_t value is bigger than another one.
 * @cmp1:	comparable1
 * @cmp2:	comparable2
 *
 * Return: true if cmp1 happened after cmp2.
*/ static inline bool ktime_after(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) > 0; } /** * ktime_before - Compare if a ktime_t value is smaller than another one. * @cmp1: comparable1 * @cmp2: comparable2 * * Return: true if cmp1 happened before cmp2. */ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) { return ktime_compare(cmp1, cmp2) < 0; } #if BITS_PER_LONG < 64 extern s64 __ktime_divns(const ktime_t kt, s64 div); static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * Negative divisors could cause an infinite loop, * so bug out here. */ BUG_ON(div < 0); if (__builtin_constant_p(div) && !(div >> 32)) { s64 ns = kt; u64 tmp = ns < 0 ? -ns : ns; do_div(tmp, div); return ns < 0 ? -tmp : tmp; } else { return __ktime_divns(kt, div); } } #else /* BITS_PER_LONG < 64 */ static inline s64 ktime_divns(const ktime_t kt, s64 div) { /* * The 32-bit implementation cannot handle negative divisors, * so catch them on 64-bit as well. */ WARN_ON(div < 0); return kt / div; } #endif static inline s64 ktime_to_us(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_USEC); } static inline s64 ktime_to_ms(const ktime_t kt) { return ktime_divns(kt, NSEC_PER_MSEC); } static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_us(ktime_sub(later, earlier)); } static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) { return ktime_to_ms(ktime_sub(later, earlier)); } static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) { return ktime_add_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec) { return ktime_add_ns(kt, msec * NSEC_PER_MSEC); } static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) { return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) { return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); } extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64 * format only if the variable contains data * @kt: the ktime_t variable to convert * @ts: the timespec64 variable to store the result in * * Return: %true if there was a successful conversion, %false if kt was 0. */ static inline __must_check bool ktime_to_timespec64_cond(const ktime_t kt, struct timespec64 *ts) { if (kt) { *ts = ktime_to_timespec64(kt); return true; } else { return false; } } #include <vdso/ktime.h> static inline ktime_t ns_to_ktime(u64 ns) { return ns; } static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; } # include <linux/timekeeping.h> #endif
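/*
 * Illustrative sketch (not part of the header above): typical use of the
 * ktime helpers to time an operation. "demo_measure" and the pr_info()
 * message are hypothetical.
 */
#include <linux/ktime.h>
#include <linux/printk.h>

static void demo_measure(void)
{
	ktime_t start, end;
	s64 elapsed_us;

	start = ktime_get();	/* monotonic clock, ktime_t in nanoseconds */
	/* ... the work being timed ... */
	end = ktime_get();

	/* ktime_us_delta() wraps ktime_sub() + ktime_to_us() */
	elapsed_us = ktime_us_delta(end, start);
	pr_info("operation took %lld us\n", elapsed_us);
}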
/* SPDX-License-Identifier: GPL-2.0+ */ /* * Driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King.
*/ #include <linux/bits.h> #include <linux/serial_8250.h> #include <linux/serial_reg.h> #include <linux/dmaengine.h> #include "../serial_mctrl_gpio.h" struct uart_8250_dma { int (*tx_dma)(struct uart_8250_port *p); int (*rx_dma)(struct uart_8250_port *p); void (*prepare_tx_dma)(struct uart_8250_port *p); void (*prepare_rx_dma)(struct uart_8250_port *p); /* Filter function */ dma_filter_fn fn; /* Parameter to the filter function */ void *rx_param; void *tx_param; struct dma_slave_config rxconf; struct dma_slave_config txconf; struct dma_chan *rxchan; struct dma_chan *txchan; /* Device address base for DMA operations */ phys_addr_t rx_dma_addr; phys_addr_t tx_dma_addr; /* DMA address of the buffer in memory */ dma_addr_t rx_addr; dma_addr_t tx_addr; dma_cookie_t rx_cookie; dma_cookie_t tx_cookie; void *rx_buf; size_t rx_size; size_t tx_size; unsigned char tx_running; unsigned char tx_err; unsigned char rx_running; }; struct old_serial_port { unsigned int uart; unsigned int baud_base; unsigned int port; unsigned int irq; upf_t flags; unsigned char io_type; unsigned char __iomem *iomem_base; unsigned short iomem_reg_shift; }; struct serial8250_config { const char *name; unsigned short fifo_size; unsigned short tx_loadsz; unsigned char fcr; unsigned char rxtrig_bytes[UART_FCR_R_TRIG_MAX_STATE]; unsigned int flags; }; #define UART_CAP_FIFO BIT(8) /* UART has FIFO */ #define UART_CAP_EFR BIT(9) /* UART has EFR */ #define UART_CAP_SLEEP BIT(10) /* UART has IER sleep */ #define UART_CAP_AFE BIT(11) /* MCR-based hw flow control */ #define UART_CAP_UUE BIT(12) /* UART needs IER bit 6 set (Xscale) */ #define UART_CAP_RTOIE BIT(13) /* UART needs IER bit 4 set (Xscale, Tegra) */ #define UART_CAP_HFIFO BIT(14) /* UART has a "hidden" FIFO */ #define UART_CAP_RPM BIT(15) /* Runtime PM is active while idle */ #define UART_CAP_IRDA BIT(16) /* UART supports IrDA line discipline */ #define UART_CAP_MINI BIT(17) /* Mini UART on BCM283X family lacks: * STOP PARITY EPAR SPAR WLEN5 WLEN6 */ #define UART_CAP_NOTEMT BIT(18) /* UART without interrupt on TEMT available */ #define UART_BUG_QUOT BIT(0) /* UART has buggy quot LSB */ #define UART_BUG_TXEN BIT(1) /* UART has buggy TX IIR status */ #define UART_BUG_NOMSR BIT(2) /* UART has buggy MSR status bits (Au1x00) */ #define UART_BUG_THRE BIT(3) /* UART has buggy THRE reassertion */ #define UART_BUG_TXRACE BIT(5) /* UART Tx fails to set remote DR */ #ifdef CONFIG_SERIAL_8250_SHARE_IRQ #define SERIAL8250_SHARE_IRQS 1 #else #define SERIAL8250_SHARE_IRQS 0 #endif #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ { \ .iobase = _base, \ .irq = _irq, \ .uartclk = 1843200, \ .iotype = UPIO_PORT, \ .flags = UPF_BOOT_AUTOCONF | (_flags), \ } #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0) static inline int serial_in(struct uart_8250_port *up, int offset) { return up->port.serial_in(&up->port, offset); } static inline void serial_out(struct uart_8250_port *up, int offset, int value) { up->port.serial_out(&up->port, offset, value); } /** * serial_lsr_in - Read LSR register and preserve flags across reads * @up: uart 8250 port * * Read LSR register and handle saving non-preserved flags across reads. * The flags that are not preserved across reads are stored into * up->lsr_saved_flags. * * Returns LSR value or'ed with the preserved flags (if any). 
*/ static inline u16 serial_lsr_in(struct uart_8250_port *up) { u16 lsr = up->lsr_saved_flags; lsr |= serial_in(up, UART_LSR); up->lsr_saved_flags = lsr & up->lsr_save_mask; return lsr; } /* * For the 16C950 */ static void serial_icr_write(struct uart_8250_port *up, int offset, int value) { serial_out(up, UART_SCR, offset); serial_out(up, UART_ICR, value); } static unsigned int __maybe_unused serial_icr_read(struct uart_8250_port *up, int offset) { unsigned int value; serial_icr_write(up, UART_ACR, up->acr | UART_ACR_ICRRD); serial_out(up, UART_SCR, offset); value = serial_in(up, UART_ICR); serial_icr_write(up, UART_ACR, up->acr); return value; } void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p); static inline u32 serial_dl_read(struct uart_8250_port *up) { return up->dl_read(up); } static inline void serial_dl_write(struct uart_8250_port *up, u32 value) { up->dl_write(up, value); } static inline bool serial8250_set_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } static inline bool serial8250_clear_THRI(struct uart_8250_port *up) { /* Port locked to synchronize UART_IER access against the console. */ lockdep_assert_held_once(&up->port.lock); if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; serial_out(up, UART_IER, up->ier); return true; } struct uart_8250_port *serial8250_get_port(int line); void serial8250_rpm_get(struct uart_8250_port *p); void serial8250_rpm_put(struct uart_8250_port *p); void serial8250_rpm_get_tx(struct uart_8250_port *p); void serial8250_rpm_put_tx(struct uart_8250_port *p); int serial8250_em485_config(struct uart_port *port, struct ktermios *termios, struct serial_rs485 *rs485); void serial8250_em485_start_tx(struct uart_8250_port *p); void serial8250_em485_stop_tx(struct uart_8250_port *p); void serial8250_em485_destroy(struct uart_8250_port *p); extern struct serial_rs485 serial8250_em485_supported; /* MCR <-> TIOCM conversion */ static inline int serial8250_TIOCM_to_MCR(int tiocm) { int mcr = 0; if (tiocm & TIOCM_RTS) mcr |= UART_MCR_RTS; if (tiocm & TIOCM_DTR) mcr |= UART_MCR_DTR; if (tiocm & TIOCM_OUT1) mcr |= UART_MCR_OUT1; if (tiocm & TIOCM_OUT2) mcr |= UART_MCR_OUT2; if (tiocm & TIOCM_LOOP) mcr |= UART_MCR_LOOP; return mcr; } static inline int serial8250_MCR_to_TIOCM(int mcr) { int tiocm = 0; if (mcr & UART_MCR_RTS) tiocm |= TIOCM_RTS; if (mcr & UART_MCR_DTR) tiocm |= TIOCM_DTR; if (mcr & UART_MCR_OUT1) tiocm |= TIOCM_OUT1; if (mcr & UART_MCR_OUT2) tiocm |= TIOCM_OUT2; if (mcr & UART_MCR_LOOP) tiocm |= TIOCM_LOOP; return tiocm; } /* MSR <-> TIOCM conversion */ static inline int serial8250_MSR_to_TIOCM(int msr) { int tiocm = 0; if (msr & UART_MSR_DCD) tiocm |= TIOCM_CAR; if (msr & UART_MSR_RI) tiocm |= TIOCM_RNG; if (msr & UART_MSR_DSR) tiocm |= TIOCM_DSR; if (msr & UART_MSR_CTS) tiocm |= TIOCM_CTS; return tiocm; } static inline void serial8250_out_MCR(struct uart_8250_port *up, int value) { serial_out(up, UART_MCR, value); if (up->gpios) mctrl_gpio_set(up->gpios, serial8250_MCR_to_TIOCM(value)); } static inline int serial8250_in_MCR(struct uart_8250_port *up) { int mctrl; mctrl = serial_in(up, UART_MCR); if (up->gpios) { unsigned int mctrl_gpio = 0; mctrl_gpio = mctrl_gpio_get_outputs(up->gpios, &mctrl_gpio); mctrl |= serial8250_TIOCM_to_MCR(mctrl_gpio); } return mctrl; } bool alpha_jensen(void); void 
alpha_jensen_set_mctrl(struct uart_port *port, unsigned int mctrl); #ifdef CONFIG_SERIAL_8250_PNP int serial8250_pnp_init(void); void serial8250_pnp_exit(void); #else static inline int serial8250_pnp_init(void) { return 0; } static inline void serial8250_pnp_exit(void) { } #endif #ifdef CONFIG_SERIAL_8250_FINTEK int fintek_8250_probe(struct uart_8250_port *uart); #else static inline int fintek_8250_probe(struct uart_8250_port *uart) { return 0; } #endif #ifdef CONFIG_ARCH_OMAP1 #include <linux/soc/ti/omap1-soc.h> static inline int is_omap1_8250(struct uart_8250_port *pt) { int res; switch (pt->port.mapbase) { case OMAP1_UART1_BASE: case OMAP1_UART2_BASE: case OMAP1_UART3_BASE: res = 1; break; default: res = 0; break; } return res; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { if (!cpu_is_omap1510()) return 0; return is_omap1_8250(pt); } #else static inline int is_omap1_8250(struct uart_8250_port *pt) { return 0; } static inline int is_omap1510_8250(struct uart_8250_port *pt) { return 0; } #endif #ifdef CONFIG_SERIAL_8250_DMA extern int serial8250_tx_dma(struct uart_8250_port *); extern int serial8250_rx_dma(struct uart_8250_port *); extern void serial8250_rx_dma_flush(struct uart_8250_port *); extern int serial8250_request_dma(struct uart_8250_port *); extern void serial8250_release_dma(struct uart_8250_port *); static inline void serial8250_do_prepare_tx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_tx_dma) dma->prepare_tx_dma(p); } static inline void serial8250_do_prepare_rx_dma(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; if (dma->prepare_rx_dma) dma->prepare_rx_dma(p); } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { struct uart_8250_dma *dma = p->dma; return dma && dma->tx_running; } #else static inline int serial8250_tx_dma(struct uart_8250_port *p) { return -1; } static inline int serial8250_rx_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_rx_dma_flush(struct uart_8250_port *p) { } static inline int serial8250_request_dma(struct uart_8250_port *p) { return -1; } static inline void serial8250_release_dma(struct uart_8250_port *p) { } static inline bool serial8250_tx_dma_running(struct uart_8250_port *p) { return false; } #endif static inline int ns16550a_goto_highspeed(struct uart_8250_port *up) { unsigned char status; status = serial_in(up, 0x04); /* EXCR2 */ #define PRESL(x) ((x) & 0x30) if (PRESL(status) == 0x10) { /* already in high speed mode */ return 0; } else { status &= ~0xB0; /* Disable LOCK, mask out PRESL[01] */ status |= 0x10; /* 1.625 divisor for baud_base --> 921600 */ serial_out(up, 0x04, status); } return 1; } static inline int serial_index(struct uart_port *port) { return port->minor - 64; }
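/*
 * Illustrative sketch (hypothetical, not part of the header above): a
 * set_mctrl() implementation built from the helpers declared here,
 * mirroring the sort of thing the core 8250 code does. "demo_set_mctrl"
 * is made up for the example.
 */
static void demo_set_mctrl(struct uart_port *port, unsigned int mctrl)
{
	struct uart_8250_port *up = up_to_u8250p(port);
	unsigned char mcr;

	/* translate TIOCM_* bits into UART_MCR_* bits */
	mcr = serial8250_TIOCM_to_MCR(mctrl);

	/* fold in any driver-private MCR bits kept in the port struct */
	mcr |= up->mcr;

	/* writes UART_MCR and propagates the state to mctrl GPIOs, if any */
	serial8250_out_MCR(up, mcr);
}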
// SPDX-License-Identifier: GPL-2.0 /* * /proc/sys support */ #include <linux/init.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/printk.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> #include <linux/mount.h> #include <linux/kmemleak.h> #include "internal.h" #define list_for_each_table_entry(entry, header) \ entry = header->ctl_table; \ for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* Support for permanently empty directories */ static struct ctl_table sysctl_mount_point[] = { {.type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY } }; /** * register_sysctl_mount_point() - registers a sysctl mount point * @path: path for the mount point * * Used to create a permanently empty directory to serve as mount point. * There are some subtle but important permission checks this allows in the * case of unprivileged mounts.
*/ struct ctl_table_header *register_sysctl_mount_point(const char *path) { return register_sysctl(path, sysctl_mount_point); } EXPORT_SYMBOL(register_sysctl_mount_point); #define sysctl_is_perm_empty_ctl_table(tptr) \ (tptr[0].type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_is_perm_empty_ctl_header(hptr) \ (sysctl_is_perm_empty_ctl_table(hptr->ctl_table)) #define sysctl_set_perm_empty_ctl_header(hptr) \ (hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) #define sysctl_clear_perm_empty_ctl_header(hptr) \ (hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) return; atomic_inc(&poll->event); wake_up_interruptible(&poll->wait); } static struct ctl_table root_table[] = { { .procname = "", .mode = S_IFDIR|S_IRUGO|S_IXUGO, }, }; static struct ctl_table_root sysctl_table_root = { .default_set.dir.header = { {{.count = 1, .nreg = 1, .ctl_table = root_table }}, .ctl_table_arg = root_table, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }, }; static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); static void sysctl_print_dir(struct ctl_dir *dir) { if (dir->header.parent) sysctl_print_dir(dir->header.parent); pr_cont("%s/", dir->header.ctl_table[0].procname); } static int namecmp(const char *name1, int len1, const char *name2, int len2) { int cmp; cmp = memcmp(name1, name2, min(len1, len2)); if (cmp == 0) cmp = len1 - len2; return cmp; } /* Called under sysctl_lock */ static struct ctl_table *find_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; struct rb_node *node = dir->root.rb_node; while (node) { struct ctl_node *ctl_node; const char *procname; int cmp; ctl_node = rb_entry(node, struct ctl_node, node); head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; procname = entry->procname; cmp = namecmp(name, namelen, procname, strlen(procname)); if (cmp < 0) node = node->rb_left; else if (cmp > 0) node = node->rb_right; else { *phead = head; return entry; } } return NULL; } static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; struct rb_node **p = &head->parent->root.rb_node; struct rb_node *parent = NULL; const char *name = entry->procname; int namelen = strlen(name); while (*p) { struct ctl_table_header *parent_head; struct ctl_table *parent_entry; struct ctl_node *parent_node; const char *parent_name; int cmp; parent = *p; parent_node = rb_entry(parent, struct ctl_node, node); parent_head = parent_node->header; parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; parent_name = parent_entry->procname; cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); pr_cont("%s\n", entry->procname); return -EEXIST; } } rb_link_node(node, parent, p); rb_insert_color(node, &head->parent->root); return 0; } static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) { struct rb_node *node = &head->node[entry - head->ctl_table].node; rb_erase(node, 
&head->parent->root); } static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, struct ctl_node *node, struct ctl_table *table, size_t table_size) { head->ctl_table = table; head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; head->nreg = 1; head->unregistering = NULL; head->root = root; head->set = set; head->parent = NULL; head->node = node; INIT_HLIST_HEAD(&head->inodes); if (node) { struct ctl_table *entry; list_for_each_table_entry(entry, head) { node->header = head; node++; } } } static void erase_header(struct ctl_table_header *head) { struct ctl_table *entry; list_for_each_table_entry(entry, head) erase_entry(head, entry); } static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { struct ctl_table *entry; struct ctl_table_header *dir_h = &dir->header; int err; /* Is this a permanently empty directory? */ if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ if (header->ctl_table_size > 0 && sysctl_is_perm_empty_ctl_table(header->ctl_table)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; sysctl_set_perm_empty_ctl_header(dir_h); } dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) goto fail_links; list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; } return 0; fail: erase_header(header); put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; drop_sysctl_table(dir_h); return err; } /* called under sysctl_lock */ static int use_table(struct ctl_table_header *p) { if (unlikely(p->unregistering)) return 0; p->used++; return 1; } /* called under sysctl_lock */ static void unuse_table(struct ctl_table_header *p) { if (!--p->used) if (unlikely(p->unregistering)) complete(p->unregistering); } static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ static void start_unregistering(struct ctl_table_header *p) { /* * if p->used is 0, nobody will ever touch that entry again; * we'll eliminate all paths to it before dropping sysctl_lock */ if (unlikely(p->used)) { struct completion wait; init_completion(&wait); p->unregistering = &wait; spin_unlock(&sysctl_lock); wait_for_completion(&wait); } else { /* anything non-NULL; we'll never dereference it */ p->unregistering = ERR_PTR(-EINVAL); spin_unlock(&sysctl_lock); } /* * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. 
*/ spin_lock(&sysctl_lock); erase_header(p); } static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) { BUG_ON(!head); spin_lock(&sysctl_lock); if (!use_table(head)) head = ERR_PTR(-ENOENT); spin_unlock(&sysctl_lock); return head; } static void sysctl_head_finish(struct ctl_table_header *head) { if (!head) return; spin_lock(&sysctl_lock); unuse_table(head); spin_unlock(&sysctl_lock); } static struct ctl_table_set * lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) set = root->lookup(root); return set; } static struct ctl_table *lookup_entry(struct ctl_table_header **phead, struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; spin_lock(&sysctl_lock); entry = find_entry(&head, dir, name, namelen); if (entry && use_table(head)) *phead = head; else entry = NULL; spin_unlock(&sysctl_lock); return entry; } static struct ctl_node *first_usable_entry(struct rb_node *node) { struct ctl_node *ctl_node; for (;node; node = rb_next(node)) { ctl_node = rb_entry(node, struct ctl_node, node); if (use_table(ctl_node->header)) return ctl_node; } return NULL; } static void first_entry(struct ctl_dir *dir, struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head = NULL; struct ctl_table *entry = NULL; struct ctl_node *ctl_node; spin_lock(&sysctl_lock); ctl_node = first_usable_entry(rb_first(&dir->root)); spin_unlock(&sysctl_lock); if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head = *phead; struct ctl_table *entry = *pentry; struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; spin_lock(&sysctl_lock); unuse_table(head); ctl_node = first_usable_entry(rb_next(&ctl_node->node)); spin_unlock(&sysctl_lock); head = NULL; if (ctl_node) { head = ctl_node->header; entry = &head->ctl_table[ctl_node - head->node]; } *phead = head; *pentry = entry; } /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. 
*/ static int test_perm(int mode, int op) { if (uid_eq(current_euid(), GLOBAL_ROOT_UID)) mode >>= 6; else if (in_egroup_p(GLOBAL_ROOT_GID)) mode >>= 3; if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) { struct ctl_table_root *root = head->root; int mode; if (root->permissions) mode = root->permissions(head, table); else mode = table->mode; return test_perm(mode, op); } static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { struct ctl_table_root *root = head->root; struct inode *inode; struct proc_inode *ei; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); ei = PROC_I(inode); spin_lock(&sysctl_lock); if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; inode->i_fop = &proc_sys_file_operations; } else { inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); else { inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; } return inode; } void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); } static struct ctl_table_header *grab_header(struct inode *inode) { struct ctl_table_header *head = PROC_I(inode)->sysctl; if (!head) head = &sysctl_table_root.default_set.dir.header; return sysctl_head_grab(head); } static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ctl_table_header *head = grab_header(dir); struct ctl_table_header *h = NULL; const struct qstr *name = &dentry->d_name; struct ctl_table *p; struct inode *inode; struct dentry *err = ERR_PTR(-ENOENT); struct ctl_dir *ctl_dir; int ret; if (IS_ERR(head)) return ERR_CAST(head); ctl_dir = container_of(head, struct ctl_dir, header); p = lookup_entry(&h, ctl_dir, name->name, name->len); if (!p) goto out; if (S_ISLNK(p->mode)) { ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; } d_set_d_op(dentry, &proc_sys_dentry_operations); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); err = d_splice_alias(inode, dentry); out: if (h) sysctl_head_finish(h); sysctl_head_finish(head); return err; } static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, int write) { struct inode *inode = file_inode(iocb->ki_filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; size_t count = iov_iter_count(iter); char *kbuf; ssize_t error; if (IS_ERR(head)) return PTR_ERR(head); /* * At this point we know that the sysctl was not unregistered * and won't be until we finish. */ error = -EPERM; if (sysctl_perm(head, table, write ? 
MAY_WRITE : MAY_READ)) goto out; /* if that can happen at all, it should be -EINVAL, not -EISDIR */ error = -EINVAL; if (!table->proc_handler) goto out; /* don't even try if the size is too large */ error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; if (write) { error = -EFAULT; if (!copy_from_iter_full(kbuf, count, iter)) goto out_free_buf; kbuf[count] = '\0'; } error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; /* careful: calling conventions are nasty here */ error = table->proc_handler(table, write, kbuf, &count, &iocb->ki_pos); if (error) goto out_free_buf; if (!write) { error = -EFAULT; if (copy_to_iter(kbuf, count, iter) < count) goto out_free_buf; } error = count; out_free_buf: kvfree(kbuf); out: sysctl_head_finish(head); return error; } static ssize_t proc_sys_read(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 0); } static ssize_t proc_sys_write(struct kiocb *iocb, struct iov_iter *iter) { return proc_sys_call_handler(iocb, iter, 1); } static int proc_sys_open(struct inode *inode, struct file *filp) { struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; /* sysctl was unregistered */ if (IS_ERR(head)) return PTR_ERR(head); if (table->poll) filp->private_data = proc_sys_poll_event(table->poll); sysctl_head_finish(head); return 0; } static __poll_t proc_sys_poll(struct file *filp, poll_table *wait) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; __poll_t ret = DEFAULT_POLLMASK; unsigned long event; /* sysctl was unregistered */ if (IS_ERR(head)) return EPOLLERR | EPOLLHUP; if (!table->proc_handler) goto out; if (!table->poll) goto out; event = (unsigned long)filp->private_data; poll_wait(filp, &table->poll->wait, wait); if (event != atomic_read(&table->poll->event)) { filp->private_data = proc_sys_poll_event(table->poll); ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI; } out: sysctl_head_finish(head); return ret; } static bool proc_sys_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { struct dentry *child, *dir = file->f_path.dentry; struct inode *inode; struct qstr qname; ino_t ino = 0; unsigned type = DT_UNKNOWN; qname.name = table->procname; qname.len = strlen(table->procname); qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; if (d_in_lookup(child)) { struct dentry *res; d_set_d_op(child, &proc_sys_dentry_operations); inode = proc_sys_make_inode(dir->d_sb, head, table); res = d_splice_alias(inode, child); d_lookup_done(child); if (unlikely(res)) { if (IS_ERR(res)) { dput(child); return false; } dput(child); child = res; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); return dir_emit(ctx, qname.name, qname.len, ino, type); } static bool proc_sys_link_fill_cache(struct file *file, struct dir_context *ctx, struct ctl_table_header *head, struct ctl_table *table) { bool ret = true; head = sysctl_head_grab(head); if (IS_ERR(head)) return false; /* It is not an error if we cannot follow the link; just ignore it */ if (sysctl_follow_link(&head, &table)) goto out; ret = proc_sys_fill_cache(file, ctx,
head, table); out: sysctl_head_finish(head); return ret; } static int scan(struct ctl_table_header *head, struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { bool res; if ((*pos)++ < ctx->pos) return true; if (unlikely(S_ISLNK(table->mode))) res = proc_sys_link_fill_cache(file, ctx, head, table); else res = proc_sys_fill_cache(file, ctx, head, table); if (res) ctx->pos = *pos; return res; } static int proc_sys_readdir(struct file *file, struct dir_context *ctx) { struct ctl_table_header *head = grab_header(file_inode(file)); struct ctl_table_header *h = NULL; struct ctl_table *entry; struct ctl_dir *ctl_dir; unsigned long pos; if (IS_ERR(head)) return PTR_ERR(head); ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) goto out; pos = 2; for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { if (!scan(h, entry, &pos, file, ctx)) { sysctl_head_finish(h); break; } } out: sysctl_head_finish(head); return 0; } static int proc_sys_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { /* * sysctl entries that are not writeable, * are _NOT_ writeable, capabilities or not. */ struct ctl_table_header *head; struct ctl_table *table; int error; /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; head = grab_header(inode); if (IS_ERR(head)) return PTR_ERR(head); table = PROC_I(inode)->sysctl_entry; if (!table) /* global root - r-xr-xr-x */ error = mask & MAY_WRITE ? -EACCES : 0; else /* Use the permissions on the sysctl table entry */ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); sysctl_head_finish(head); return error; } static int proc_sys_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, attr); return 0; } static int proc_sys_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; if (IS_ERR(head)) return PTR_ERR(head); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; sysctl_head_finish(head); return 0; } static const struct file_operations proc_sys_file_operations = { .open = proc_sys_open, .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; static const struct file_operations proc_sys_dir_file_operations = { .read = generic_read_dir, .iterate_shared = proc_sys_readdir, .llseek = generic_file_llseek, }; static const struct inode_operations proc_sys_inode_operations = { .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static const struct inode_operations proc_sys_dir_operations = { .lookup = proc_sys_lookup, .permission = proc_sys_permission, .setattr = proc_sys_setattr, .getattr = proc_sys_getattr, }; static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int 
proc_sys_delete(const struct dentry *dentry) { return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) { struct ctl_table_set *set = p->set; int res; spin_lock(&sysctl_lock); if (p->unregistering) res = 0; else if (!set->is_seen) res = 1; else res = set->is_seen(set); spin_unlock(&sysctl_lock); return res; } static int proc_sys_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct ctl_table_header *head; struct inode *inode; /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ inode = d_inode_rcu(dentry); if (!inode) return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; head = rcu_dereference(PROC_I(inode)->sysctl); return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { .d_revalidate = proc_sys_revalidate, .d_delete = proc_sys_delete, .d_compare = proc_sys_compare, }; static struct ctl_dir *find_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_header *head; struct ctl_table *entry; entry = find_entry(&head, dir, name, namelen); if (!entry) return ERR_PTR(-ENOENT); if (!S_ISDIR(entry->mode)) return ERR_PTR(-ENOTDIR); return container_of(head, struct ctl_dir, header); } static struct ctl_dir *new_dir(struct ctl_table_set *set, const char *name, int namelen) { struct ctl_table *table; struct ctl_dir *new; struct ctl_node *node; char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + sizeof(struct ctl_table)*2 + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); new_name = (char *)(table + 2); memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } /** * get_subdir - find or create a subdir with the specified name. * @dir: Directory to create the subdirectory in * @name: The name of the subdirectory to find or create * @namelen: The length of name * * Takes a directory with an elevated reference count so we know that * if we drop the lock the directory will not go away. Upon success * the reference is moved from @dir to the returned subdirectory. * Upon error an error code is returned and the reference on @dir is * simply dropped. */ static struct ctl_dir *get_subdir(struct ctl_dir *dir, const char *name, int namelen) { struct ctl_table_set *set = dir->header.set; struct ctl_dir *subdir, *new = NULL; int err; spin_lock(&sysctl_lock); subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; spin_unlock(&sysctl_lock); new = new_dir(set, name, namelen); spin_lock(&sysctl_lock); subdir = ERR_PTR(-ENOMEM); if (!new) goto failed; /* Was the subdir added while we dropped the lock? */ subdir = find_subdir(dir, name, namelen); if (!IS_ERR(subdir)) goto found; if (PTR_ERR(subdir) != -ENOENT) goto failed; /* Nope. Use our freshly made directory entry.
*/ err = insert_header(dir, &new->header); subdir = ERR_PTR(err); if (err) goto failed; subdir = new; found: subdir->header.nreg++; failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("%*.*s %ld\n", namelen, namelen, name, PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) drop_sysctl_table(&new->header); spin_unlock(&sysctl_lock); return subdir; } static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) { struct ctl_dir *parent; const char *procname; if (!dir->header.parent) return &set->dir; parent = xlate_dir(set, dir->header.parent); if (IS_ERR(parent)) return parent; procname = dir->header.ctl_table[0].procname; return find_subdir(parent, procname, strlen(procname)); } static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_table **pentry) { struct ctl_table_header *head; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_table *entry; struct ctl_dir *dir; int ret; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); else { const char *procname = (*pentry)->procname; head = NULL; entry = find_entry(&head, dir, procname, strlen(procname)); ret = -ENOENT; if (entry && use_table(head)) { unuse_table(*phead); *phead = head; *pentry = entry; ret = 0; } } spin_unlock(&sysctl_lock); return ret; } static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("sysctl table check failed: %s/%s %pV\n", path, table->procname, &vaf); va_end(args); return -EINVAL; } static int sysctl_check_table_array(const char *path, struct ctl_table *table) { int err = 0; if ((table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array not allowed"); } if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); } if (table->proc_handler == proc_dobool) { if (table->maxlen != sizeof(bool)) err |= sysctl_err(path, table, "array not allowed"); } return err; } static int sysctl_check_table(const char *path, struct ctl_table_header *header) { struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, header) { if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || (entry->proc_handler == proc_douintvec) || (entry->proc_handler == proc_douintvec_minmax) || (entry->proc_handler == proc_dointvec_minmax) || (entry->proc_handler == proc_dou8vec_minmax) || (entry->proc_handler == proc_dointvec_jiffies) || (entry->proc_handler == proc_dointvec_userhz_jiffies) || (entry->proc_handler == proc_dointvec_ms_jiffies) || (entry->proc_handler == proc_doulongvec_minmax) || (entry->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { if (!entry->data) err |= sysctl_err(path, entry, "No data"); if (!entry->maxlen) err |= sysctl_err(path, entry, "No maxlen"); else err |= sysctl_check_table_array(path, entry); } if (!entry->proc_handler) err |= sysctl_err(path, entry, "No proc_handler"); if ((entry->mode & (S_IRUGO|S_IWUGO)) != entry->mode) err |= sysctl_err(path, entry, "bogus .mode 0%o", entry->mode); } return err; } static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { struct ctl_table 
*link_table, *entry, *link; struct ctl_table_header *links; struct ctl_node *node; char *link_name; int nr_entries, name_bytes; name_bytes = 0; nr_entries = 0; list_for_each_table_entry(entry, head) { nr_entries++; name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*nr_entries + sizeof(struct ctl_table)*(nr_entries + 1) + name_bytes, GFP_KERNEL); if (!links) return NULL; node = (struct ctl_node *)(links + 1); link_table = (struct ctl_table *)(node + nr_entries); link_name = (char *)&link_table[nr_entries + 1]; link = link_table; list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; link->data = head->root; link_name += len; link++; } init_header(links, dir->header.root, dir->header.set, node, link_table, head->ctl_table_size); links->nreg = nr_entries; return links; } static bool get_links(struct ctl_dir *dir, struct ctl_table_header *header, struct ctl_table_root *link_root) { struct ctl_table_header *tmp_head; struct ctl_table *entry, *link; if (header->ctl_table_size == 0 || sysctl_is_perm_empty_ctl_table(header->ctl_table)) return true; /* Are there links available for every entry in table? */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) continue; if (S_ISLNK(link->mode) && (link->data == link_root)) continue; return false; } /* The checks passed. Increase the registration count on the links */ list_for_each_table_entry(entry, header) { const char *procname = entry->procname; link = find_entry(&tmp_head, dir, procname, strlen(procname)); tmp_head->nreg++; } return true; } static int insert_links(struct ctl_table_header *head) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_dir *core_parent; struct ctl_table_header *links; int err; if (head->set == root_set) return 0; core_parent = xlate_dir(root_set, head->parent); if (IS_ERR(core_parent)) return 0; if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; if (!links) goto out; err = 0; if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } err = insert_header(core_parent, links); if (err) kfree(links); out: drop_sysctl_table(&core_parent->header); return err; } /* Find the directory for the ctl_table. If one is not found create it. */ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) { const char *name, *nextname; for (name = path; name; name = nextname) { int namelen; nextname = strchr(name, '/'); if (nextname) { namelen = nextname - name; nextname++; } else { namelen = strlen(name); } if (namelen == 0) continue; /* * namelen ensures if name is "foo/bar/yay" only foo is * registered first. We traverse as if using mkdir -p and * return a ctl_dir for the last directory entry. */ dir = get_subdir(dir, name, namelen); if (IS_ERR(dir)) break; } return dir; } /** * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure without any child. This table * should not be free'd after registration. So it should not be * used on stack. 
It can either be a global or dynamically allocated * by the caller and free'd later after sysctl unregistration. * @table_size: The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * The members of the &struct ctl_table structure are used as follows: * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * * data - a pointer to data for use by proc_handler * * maxlen - the maximum size in bytes of the data * * mode - the file permissions for the /proc/sys file * * child - must be %NULL. * * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines * XXX: we should eventually modify these to use long min / max [0] * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes (where child is not NULL) are not allowed, * sysctl_check_table() verifies this. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - * * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() * * It is the handler's job to read the input buffer from user memory * and process it. The handler should return 0 on success. * * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; struct ctl_dir *dir; struct ctl_node *node; header = kzalloc(sizeof(struct ctl_table_header) + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); init_header(header, root, set, node, table, table_size); if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); dir = &set->dir; /* Reference moved down the directory tree by get_subdir */ dir->header.nreg++; spin_unlock(&sysctl_lock); dir = sysctl_mkdir_p(dir, path); if (IS_ERR(dir)) goto fail; spin_lock(&sysctl_lock); if (insert_header(dir, header)) goto fail_put_dir_locked; drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); return header; fail_put_dir_locked: drop_sysctl_table(&dir->header); spin_unlock(&sysctl_lock); fail: kfree(header); return NULL; } /** * register_sysctl_sz - register a sysctl table * @path: The path to the directory the sysctl table is in. If the path * doesn't exist we will create it for you. * @table: the table structure. The caller must ensure the life of the @table * will be kept during the lifetime use of the sysctl. It must not be freed * until unregister_sysctl_table() is called with the table header returned * by this registration. If your code is non-modular then you don't need * to call unregister_sysctl_table() and can instead use something like * register_sysctl_init() which does not care for the result of the sysctl * registration. * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details.
*/ struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, path, table, table_size); } EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path * @path: path name for sysctl base. If that path doesn't exist we will create * it for you. * @table: This is the sysctl table that needs to be registered to the path. * The caller must ensure the life of the @table will be kept during the * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default * values pre-set. Code which depends on these variables will always work even * if register_sysctl() fails. If register_sysctl() fails you'd just lose the * ability to query or modify the sysctls dynamically at run time. Chances of * register_sysctl() failing on init are extremely low, and so for both reasons * this function does not return any error as it is used by initialization code. * * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name, size_t table_size) { struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); } static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; struct ctl_table_root *root = header->root; struct ctl_dir *parent = header->parent; struct ctl_dir *core_parent; struct ctl_table *entry; if (header->set == root_set) return; core_parent = xlate_dir(root_set, parent); if (IS_ERR(core_parent)) return; list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; struct ctl_table *link; const char *name = entry->procname; link = find_entry(&link_head, core_parent, name, strlen(name)); if (link && ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || (S_ISLNK(link->mode) && (link->data == root)))) { drop_sysctl_table(link_head); } else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); pr_cont("%s\n", name); } } } static void drop_sysctl_table(struct ctl_table_header *header) { struct ctl_dir *parent = header->parent; if (--header->nreg) return; if (parent) { put_links(header); start_unregistering(header); } if (!--header->count) kfree_rcu(header, rcu); if (parent) drop_sysctl_table(&parent->header); } /** * unregister_sysctl_table - unregister a sysctl table hierarchy * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone.
 */
void unregister_sysctl_table(struct ctl_table_header *header)
{
	might_sleep();

	if (header == NULL)
		return;

	spin_lock(&sysctl_lock);
	drop_sysctl_table(header);
	spin_unlock(&sysctl_lock);
}
EXPORT_SYMBOL(unregister_sysctl_table);

void setup_sysctl_set(struct ctl_table_set *set,
	struct ctl_table_root *root,
	int (*is_seen)(struct ctl_table_set *))
{
	memset(set, 0, sizeof(*set));
	set->is_seen = is_seen;
	init_header(&set->dir.header, root, set, NULL, root_table, 1);
}

void retire_sysctl_set(struct ctl_table_set *set)
{
	WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
}

int __init proc_sys_init(void)
{
	struct proc_dir_entry *proc_sys_root;

	proc_sys_root = proc_mkdir("sys", NULL);
	proc_sys_root->proc_iops = &proc_sys_dir_operations;
	proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations;
	proc_sys_root->nlink = 0;

	return sysctl_init_bases();
}

struct sysctl_alias {
	const char *kernel_param;
	const char *sysctl_param;
};

/*
 * Historically some settings had both sysctl and a command line parameter.
 * With the generic sysctl. parameter support, we can handle them at a single
 * place and only keep the historical name for compatibility. This is not meant
 * to add brand new aliases. When adding existing aliases, consider whether
 * the possibly different moment of changing the value (e.g. from early_param
 * to the moment do_sysctl_args() is called) is an issue for the specific
 * parameter.
 */
static const struct sysctl_alias sysctl_aliases[] = {
	{"hardlockup_all_cpu_backtrace",	"kernel.hardlockup_all_cpu_backtrace" },
	{"hung_task_panic",			"kernel.hung_task_panic" },
	{"numa_zonelist_order",			"vm.numa_zonelist_order" },
	{"softlockup_all_cpu_backtrace",	"kernel.softlockup_all_cpu_backtrace" },
	{ }
};

static const char *sysctl_find_alias(char *param)
{
	const struct sysctl_alias *alias;

	for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) {
		if (strcmp(alias->kernel_param, param) == 0)
			return alias->sysctl_param;
	}

	return NULL;
}

bool sysctl_is_alias(char *param)
{
	const char *alias = sysctl_find_alias(param);

	return alias != NULL;
}

/* Set sysctl value passed on kernel command line. */
static int process_sysctl_arg(char *param, char *val,
			       const char *unused, void *arg)
{
	char *path;
	struct vfsmount **proc_mnt = arg;
	struct file_system_type *proc_fs_type;
	struct file *file;
	int len;
	int err;
	loff_t pos = 0;
	ssize_t wret;

	if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) {
		param += sizeof("sysctl") - 1;

		if (param[0] != '/' && param[0] != '.')
			return 0;

		param++;
	} else {
		param = (char *) sysctl_find_alias(param);
		if (!param)
			return 0;
	}

	if (!val)
		return -EINVAL;
	len = strlen(val);
	if (len == 0)
		return -EINVAL;

	/*
	 * To set sysctl options, we use a temporary mount of proc, look up the
	 * respective sys/ file and write to it. To avoid mounting it when no
	 * options were given, we mount it only when the first sysctl option is
	 * found. Why not a persistent mount? There are problems with a
	 * persistent mount of proc in that it forces userspace not to use any
	 * proc mount options.
	 */
	if (!*proc_mnt) {
		proc_fs_type = get_fs_type("proc");
		if (!proc_fs_type) {
			pr_err("Failed to find procfs to set sysctl from command line\n");
			return 0;
		}
		*proc_mnt = kern_mount(proc_fs_type);
		put_filesystem(proc_fs_type);
		if (IS_ERR(*proc_mnt)) {
			pr_err("Failed to mount procfs to set sysctl from command line\n");
			return 0;
		}
	}

	path = kasprintf(GFP_KERNEL, "sys/%s", param);
	if (!path)
		panic("%s: Failed to allocate path for %s\n", __func__, param);
	strreplace(path, '.', '/');

	file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		if (err == -ENOENT)
			pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n",
				param, val);
		else if (err == -EACCES)
			pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n",
				param, val);
		else
			pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n",
				file, param, val);
		goto out;
	}
	wret = kernel_write(file, val, len, &pos);
	if (wret < 0) {
		err = wret;
		if (err == -EINVAL)
			pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n",
				param, val);
		else
			pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n",
				ERR_PTR(err), param, val);
	} else if (wret != len) {
		pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s'\n",
			wret, len, path, param, val);
	}

	err = filp_close(file, NULL);
	if (err)
		pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s'\n",
			ERR_PTR(err), param, val);
out:
	kfree(path);
	return 0;
}

void do_sysctl_args(void)
{
	char *command_line;
	struct vfsmount *proc_mnt = NULL;

	command_line = kstrdup(saved_command_line, GFP_KERNEL);
	if (!command_line)
		panic("%s: Failed to allocate copy of command line\n", __func__);

	parse_args("Setting sysctl args", command_line,
		   NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg);

	if (proc_mnt)
		kern_unmount(proc_mnt);

	kfree(command_line);
}
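/*
 * Editor's example (not part of the original file): a minimal sketch of how
 * kernel code typically uses the registration API above. The names
 * example_value, example_table, example_header, example_sysctl_init and the
 * "example" path are hypothetical, invented purely for illustration.
 */
#if 0	/* illustration only, not compiled */
static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* a completely 0 filled entry terminates the table */
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/example/example_value; the "example"
	 * directory is created on demand, as documented above. */
	example_header = register_sysctl_sz("example", example_table,
					    ARRAY_SIZE(example_table));
	if (!example_header)
		return -ENOMEM;
	return 0;
}

static void example_sysctl_exit(void)
{
	/* Must run before example_table's storage goes away. */
	unregister_sysctl_table(example_header);
}
#endif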
// SPDX-License-Identifier: GPL-2.0
/*
 * USB Raw Gadget driver.
 * See Documentation/usb/raw-gadget.rst for more details.
 *
 * Copyright (c) 2020 Google, Inc.
* Author: Andrey Konovalov <andreyknvl@gmail.com> */ #include <linux/compiler.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/idr.h> #include <linux/kref.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/semaphore.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/wait.h> #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/usb/ch11.h> #include <linux/usb/gadget.h> #include <linux/usb/composite.h> #include <uapi/linux/usb/raw_gadget.h> #define DRIVER_DESC "USB Raw Gadget" #define DRIVER_NAME "raw-gadget" MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("Andrey Konovalov"); MODULE_LICENSE("GPL"); /*----------------------------------------------------------------------*/ static DEFINE_IDA(driver_id_numbers); #define DRIVER_DRIVER_NAME_LENGTH_MAX 32 #define RAW_EVENT_QUEUE_SIZE 16 struct raw_event_queue { /* See the comment in raw_event_queue_fetch() for locking details. */ spinlock_t lock; struct semaphore sema; struct usb_raw_event *events[RAW_EVENT_QUEUE_SIZE]; int size; }; static void raw_event_queue_init(struct raw_event_queue *queue) { spin_lock_init(&queue->lock); sema_init(&queue->sema, 0); queue->size = 0; } static int raw_event_queue_add(struct raw_event_queue *queue, enum usb_raw_event_type type, size_t length, const void *data) { unsigned long flags; struct usb_raw_event *event; spin_lock_irqsave(&queue->lock, flags); if (queue->size >= RAW_EVENT_QUEUE_SIZE) { spin_unlock_irqrestore(&queue->lock, flags); return -ENOMEM; } event = kmalloc(sizeof(*event) + length, GFP_ATOMIC); if (!event) { spin_unlock_irqrestore(&queue->lock, flags); return -ENOMEM; } event->type = type; event->length = length; if (event->length) memcpy(&event->data[0], data, length); queue->events[queue->size] = event; queue->size++; up(&queue->sema); spin_unlock_irqrestore(&queue->lock, flags); return 0; } static struct usb_raw_event *raw_event_queue_fetch( struct raw_event_queue *queue) { int ret; unsigned long flags; struct usb_raw_event *event; /* * This function can be called concurrently. We first check that * there's at least one event queued by decrementing the semaphore, * and then take the lock to protect queue struct fields. */ ret = down_interruptible(&queue->sema); if (ret) return ERR_PTR(ret); spin_lock_irqsave(&queue->lock, flags); /* * queue->size must have the same value as queue->sema counter (before * the down_interruptible() call above), so this check is a fail-safe. 
*/ if (WARN_ON(!queue->size)) { spin_unlock_irqrestore(&queue->lock, flags); return ERR_PTR(-ENODEV); } event = queue->events[0]; queue->size--; memmove(&queue->events[0], &queue->events[1], queue->size * sizeof(queue->events[0])); spin_unlock_irqrestore(&queue->lock, flags); return event; } static void raw_event_queue_destroy(struct raw_event_queue *queue) { int i; for (i = 0; i < queue->size; i++) kfree(queue->events[i]); queue->size = 0; } /*----------------------------------------------------------------------*/ struct raw_dev; enum ep_state { STATE_EP_DISABLED, STATE_EP_ENABLED, }; struct raw_ep { struct raw_dev *dev; enum ep_state state; struct usb_ep *ep; u8 addr; struct usb_request *req; bool urb_queued; bool disabling; ssize_t status; }; enum dev_state { STATE_DEV_INVALID = 0, STATE_DEV_OPENED, STATE_DEV_INITIALIZED, STATE_DEV_REGISTERING, STATE_DEV_RUNNING, STATE_DEV_CLOSED, STATE_DEV_FAILED }; struct raw_dev { struct kref count; spinlock_t lock; const char *udc_name; struct usb_gadget_driver driver; /* Reference to misc device: */ struct device *dev; /* Make driver names unique */ int driver_id_number; /* Protected by lock: */ enum dev_state state; bool gadget_registered; struct usb_gadget *gadget; struct usb_request *req; bool ep0_in_pending; bool ep0_out_pending; bool ep0_urb_queued; ssize_t ep0_status; struct raw_ep eps[USB_RAW_EPS_NUM_MAX]; int eps_num; struct completion ep0_done; struct raw_event_queue queue; }; static struct raw_dev *dev_new(void) { struct raw_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; /* Matches kref_put() in raw_release(). */ kref_init(&dev->count); spin_lock_init(&dev->lock); init_completion(&dev->ep0_done); raw_event_queue_init(&dev->queue); dev->driver_id_number = -1; return dev; } static void dev_free(struct kref *kref) { struct raw_dev *dev = container_of(kref, struct raw_dev, count); int i; kfree(dev->udc_name); kfree(dev->driver.udc_name); kfree(dev->driver.driver.name); if (dev->driver_id_number >= 0) ida_free(&driver_id_numbers, dev->driver_id_number); if (dev->req) { if (dev->ep0_urb_queued) usb_ep_dequeue(dev->gadget->ep0, dev->req); usb_ep_free_request(dev->gadget->ep0, dev->req); } raw_event_queue_destroy(&dev->queue); for (i = 0; i < dev->eps_num; i++) { if (dev->eps[i].state == STATE_EP_DISABLED) continue; usb_ep_disable(dev->eps[i].ep); usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); kfree(dev->eps[i].ep->desc); dev->eps[i].state = STATE_EP_DISABLED; } kfree(dev); } /*----------------------------------------------------------------------*/ static int raw_queue_event(struct raw_dev *dev, enum usb_raw_event_type type, size_t length, const void *data) { int ret = 0; unsigned long flags; ret = raw_event_queue_add(&dev->queue, type, length, data); if (ret < 0) { spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_FAILED; spin_unlock_irqrestore(&dev->lock, flags); } return ret; } static void gadget_ep0_complete(struct usb_ep *ep, struct usb_request *req) { struct raw_dev *dev = req->context; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (req->status) dev->ep0_status = req->status; else dev->ep0_status = req->actual; if (dev->ep0_in_pending) dev->ep0_in_pending = false; else dev->ep0_out_pending = false; spin_unlock_irqrestore(&dev->lock, flags); complete(&dev->ep0_done); } static u8 get_ep_addr(const char *name) { /* If the endpoint has fixed function (named as e.g. "ep12out-bulk"), * parse the endpoint address from its name. 
We deliberately use * deprecated simple_strtoul() function here, as the number isn't * followed by '\0' nor '\n'. */ if (isdigit(name[2])) return simple_strtoul(&name[2], NULL, 10); /* Otherwise the endpoint is configurable (named as e.g. "ep-a"). */ return USB_RAW_EP_ADDR_ANY; } static int gadget_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { int ret = 0, i = 0; struct raw_dev *dev = container_of(driver, struct raw_dev, driver); struct usb_request *req; struct usb_ep *ep; unsigned long flags; if (strcmp(gadget->name, dev->udc_name) != 0) return -ENODEV; set_gadget_data(gadget, dev); req = usb_ep_alloc_request(gadget->ep0, GFP_KERNEL); if (!req) { dev_err(&gadget->dev, "usb_ep_alloc_request failed\n"); set_gadget_data(gadget, NULL); return -ENOMEM; } spin_lock_irqsave(&dev->lock, flags); dev->req = req; dev->req->context = dev; dev->req->complete = gadget_ep0_complete; dev->gadget = gadget; gadget_for_each_ep(ep, dev->gadget) { dev->eps[i].ep = ep; dev->eps[i].addr = get_ep_addr(ep->name); dev->eps[i].state = STATE_EP_DISABLED; i++; } dev->eps_num = i; spin_unlock_irqrestore(&dev->lock, flags); dev_dbg(&gadget->dev, "gadget connected\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_CONNECT, 0, NULL); if (ret < 0) { dev_err(&gadget->dev, "failed to queue connect event\n"); set_gadget_data(gadget, NULL); return ret; } /* Matches kref_put() in gadget_unbind(). */ kref_get(&dev->count); return ret; } static void gadget_unbind(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); set_gadget_data(gadget, NULL); /* Matches kref_get() in gadget_bind(). */ kref_put(&dev->count, dev_free); } static int gadget_setup(struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { int ret = 0; struct raw_dev *dev = get_gadget_data(gadget); unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_err(&gadget->dev, "ignoring, device is not running\n"); ret = -ENODEV; goto out_unlock; } if (dev->ep0_in_pending || dev->ep0_out_pending) { dev_dbg(&gadget->dev, "stalling, request already pending\n"); ret = -EBUSY; goto out_unlock; } if ((ctrl->bRequestType & USB_DIR_IN) && ctrl->wLength) dev->ep0_in_pending = true; else dev->ep0_out_pending = true; spin_unlock_irqrestore(&dev->lock, flags); ret = raw_queue_event(dev, USB_RAW_EVENT_CONTROL, sizeof(*ctrl), ctrl); if (ret < 0) dev_err(&gadget->dev, "failed to queue control event\n"); goto out; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); out: if (ret == 0 && ctrl->wLength == 0) { /* * Return USB_GADGET_DELAYED_STATUS as a workaround to stop * some UDC drivers (e.g. dwc3) from automatically proceeding * with the status stage for 0-length transfers. * Should be removed once all UDC drivers are fixed to always * delay the status stage until a response is queued to EP0. 
*/ return USB_GADGET_DELAYED_STATUS; } return ret; } static void gadget_disconnect(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget disconnected\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_DISCONNECT, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue disconnect event\n"); } static void gadget_suspend(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget suspended\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_SUSPEND, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue suspend event\n"); } static void gadget_resume(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget resumed\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_RESUME, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue resume event\n"); } static void gadget_reset(struct usb_gadget *gadget) { struct raw_dev *dev = get_gadget_data(gadget); int ret; dev_dbg(&gadget->dev, "gadget reset\n"); ret = raw_queue_event(dev, USB_RAW_EVENT_RESET, 0, NULL); if (ret < 0) dev_err(&gadget->dev, "failed to queue reset event\n"); } /*----------------------------------------------------------------------*/ static struct miscdevice raw_misc_device; static int raw_open(struct inode *inode, struct file *fd) { struct raw_dev *dev; /* Nonblocking I/O is not supported yet. */ if (fd->f_flags & O_NONBLOCK) return -EINVAL; dev = dev_new(); if (!dev) return -ENOMEM; fd->private_data = dev; dev->state = STATE_DEV_OPENED; dev->dev = raw_misc_device.this_device; return 0; } static int raw_release(struct inode *inode, struct file *fd) { int ret = 0; struct raw_dev *dev = fd->private_data; unsigned long flags; bool unregister = false; spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_CLOSED; if (!dev->gadget) { spin_unlock_irqrestore(&dev->lock, flags); goto out_put; } if (dev->gadget_registered) unregister = true; dev->gadget_registered = false; spin_unlock_irqrestore(&dev->lock, flags); if (unregister) { ret = usb_gadget_unregister_driver(&dev->driver); if (ret != 0) dev_err(dev->dev, "usb_gadget_unregister_driver() failed with %d\n", ret); /* Matches kref_get() in raw_ioctl_run(). */ kref_put(&dev->count, dev_free); } out_put: /* Matches dev_new() in raw_open(). 
*/ kref_put(&dev->count, dev_free); return ret; } /*----------------------------------------------------------------------*/ static int raw_ioctl_init(struct raw_dev *dev, unsigned long value) { int ret = 0; int driver_id_number; struct usb_raw_init arg; char *udc_driver_name; char *udc_device_name; char *driver_driver_name; unsigned long flags; if (copy_from_user(&arg, (void __user *)value, sizeof(arg))) return -EFAULT; switch (arg.speed) { case USB_SPEED_UNKNOWN: arg.speed = USB_SPEED_HIGH; break; case USB_SPEED_LOW: case USB_SPEED_FULL: case USB_SPEED_HIGH: case USB_SPEED_SUPER: break; default: return -EINVAL; } driver_id_number = ida_alloc(&driver_id_numbers, GFP_KERNEL); if (driver_id_number < 0) return driver_id_number; driver_driver_name = kmalloc(DRIVER_DRIVER_NAME_LENGTH_MAX, GFP_KERNEL); if (!driver_driver_name) { ret = -ENOMEM; goto out_free_driver_id_number; } snprintf(driver_driver_name, DRIVER_DRIVER_NAME_LENGTH_MAX, DRIVER_NAME ".%d", driver_id_number); udc_driver_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); if (!udc_driver_name) { ret = -ENOMEM; goto out_free_driver_driver_name; } ret = strscpy(udc_driver_name, &arg.driver_name[0], UDC_NAME_LENGTH_MAX); if (ret < 0) goto out_free_udc_driver_name; ret = 0; udc_device_name = kmalloc(UDC_NAME_LENGTH_MAX, GFP_KERNEL); if (!udc_device_name) { ret = -ENOMEM; goto out_free_udc_driver_name; } ret = strscpy(udc_device_name, &arg.device_name[0], UDC_NAME_LENGTH_MAX); if (ret < 0) goto out_free_udc_device_name; ret = 0; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_OPENED) { dev_dbg(dev->dev, "fail, device is not opened\n"); ret = -EINVAL; goto out_unlock; } dev->udc_name = udc_driver_name; dev->driver.function = DRIVER_DESC; dev->driver.max_speed = arg.speed; dev->driver.setup = gadget_setup; dev->driver.disconnect = gadget_disconnect; dev->driver.bind = gadget_bind; dev->driver.unbind = gadget_unbind; dev->driver.suspend = gadget_suspend; dev->driver.resume = gadget_resume; dev->driver.reset = gadget_reset; dev->driver.driver.name = driver_driver_name; dev->driver.udc_name = udc_device_name; dev->driver.match_existing_only = 1; dev->driver_id_number = driver_id_number; dev->state = STATE_DEV_INITIALIZED; spin_unlock_irqrestore(&dev->lock, flags); return ret; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); out_free_udc_device_name: kfree(udc_device_name); out_free_udc_driver_name: kfree(udc_driver_name); out_free_driver_driver_name: kfree(driver_driver_name); out_free_driver_id_number: ida_free(&driver_id_numbers, driver_id_number); return ret; } static int raw_ioctl_run(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_INITIALIZED) { dev_dbg(dev->dev, "fail, device is not initialized\n"); ret = -EINVAL; goto out_unlock; } dev->state = STATE_DEV_REGISTERING; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_gadget_register_driver(&dev->driver); spin_lock_irqsave(&dev->lock, flags); if (ret) { dev_err(dev->dev, "fail, usb_gadget_register_driver returned %d\n", ret); dev->state = STATE_DEV_FAILED; goto out_unlock; } dev->gadget_registered = true; dev->state = STATE_DEV_RUNNING; /* Matches kref_put() in raw_release(). 
*/ kref_get(&dev->count); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_event_fetch(struct raw_dev *dev, unsigned long value) { struct usb_raw_event arg; unsigned long flags; struct usb_raw_event *event; uint32_t length; if (copy_from_user(&arg, (void __user *)value, sizeof(arg))) return -EFAULT; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); spin_unlock_irqrestore(&dev->lock, flags); return -EINVAL; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); spin_unlock_irqrestore(&dev->lock, flags); return -EBUSY; } spin_unlock_irqrestore(&dev->lock, flags); event = raw_event_queue_fetch(&dev->queue); if (PTR_ERR(event) == -EINTR) { dev_dbg(&dev->gadget->dev, "event fetching interrupted\n"); return -EINTR; } if (IS_ERR(event)) { dev_err(&dev->gadget->dev, "failed to fetch event\n"); spin_lock_irqsave(&dev->lock, flags); dev->state = STATE_DEV_FAILED; spin_unlock_irqrestore(&dev->lock, flags); return -ENODEV; } length = min(arg.length, event->length); if (copy_to_user((void __user *)value, event, sizeof(*event) + length)) { kfree(event); return -EFAULT; } kfree(event); return 0; } static void *raw_alloc_io_data(struct usb_raw_ep_io *io, void __user *ptr, bool get_from_user) { void *data; if (copy_from_user(io, ptr, sizeof(*io))) return ERR_PTR(-EFAULT); if (io->ep >= USB_RAW_EPS_NUM_MAX) return ERR_PTR(-EINVAL); if (!usb_raw_io_flags_valid(io->flags)) return ERR_PTR(-EINVAL); if (io->length > PAGE_SIZE) return ERR_PTR(-EINVAL); if (get_from_user) data = memdup_user(ptr + sizeof(*io), io->length); else { data = kmalloc(io->length, GFP_KERNEL); if (!data) data = ERR_PTR(-ENOMEM); } return data; } static int raw_process_ep0_io(struct raw_dev *dev, struct usb_raw_ep_io *io, void *data, bool in) { int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (dev->ep0_urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if ((in && !dev->ep0_in_pending) || (!in && !dev->ep0_out_pending)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EBUSY; goto out_unlock; } if (WARN_ON(in && dev->ep0_out_pending)) { ret = -ENODEV; dev->state = STATE_DEV_FAILED; goto out_unlock; } if (WARN_ON(!in && dev->ep0_in_pending)) { ret = -ENODEV; dev->state = STATE_DEV_FAILED; goto out_unlock; } dev->req->buf = data; dev->req->length = io->length; dev->req->zero = usb_raw_io_flags_zero(io->flags); dev->ep0_urb_queued = true; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_ep_queue(dev->gadget->ep0, dev->req, GFP_KERNEL); if (ret) { dev_err(&dev->gadget->dev, "fail, usb_ep_queue returned %d\n", ret); spin_lock_irqsave(&dev->lock, flags); goto out_queue_failed; } ret = wait_for_completion_interruptible(&dev->ep0_done); if (ret) { dev_dbg(&dev->gadget->dev, "wait interrupted\n"); usb_ep_dequeue(dev->gadget->ep0, dev->req); wait_for_completion(&dev->ep0_done); spin_lock_irqsave(&dev->lock, flags); if (dev->ep0_status == -ECONNRESET) dev->ep0_status = -EINTR; goto out_interrupted; } spin_lock_irqsave(&dev->lock, flags); out_interrupted: ret = dev->ep0_status; out_queue_failed: dev->ep0_urb_queued = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static 
int raw_ioctl_ep0_write(struct raw_dev *dev, unsigned long value) { int ret = 0; void *data; struct usb_raw_ep_io io; data = raw_alloc_io_data(&io, (void __user *)value, true); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep0_io(dev, &io, data, true); kfree(data); return ret; } static int raw_ioctl_ep0_read(struct raw_dev *dev, unsigned long value) { int ret = 0; void *data; struct usb_raw_ep_io io; unsigned int length; data = raw_alloc_io_data(&io, (void __user *)value, false); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep0_io(dev, &io, data, false); if (ret < 0) goto free; length = min(io.length, (unsigned int)ret); if (copy_to_user((void __user *)(value + sizeof(io)), data, length)) ret = -EFAULT; else ret = length; free: kfree(data); return ret; } static int raw_ioctl_ep0_stall(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (dev->ep0_urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if (!dev->ep0_in_pending && !dev->ep0_out_pending) { dev_dbg(&dev->gadget->dev, "fail, no request pending\n"); ret = -EBUSY; goto out_unlock; } ret = usb_ep_set_halt(dev->gadget->ep0); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_halt returned %d\n", ret); if (dev->ep0_in_pending) dev->ep0_in_pending = false; else dev->ep0_out_pending = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_enable(struct raw_dev *dev, unsigned long value) { int ret = 0, i; unsigned long flags; struct usb_endpoint_descriptor *desc; struct raw_ep *ep; bool ep_props_matched = false; desc = memdup_user((void __user *)value, sizeof(*desc)); if (IS_ERR(desc)) return PTR_ERR(desc); /* * Endpoints with a maxpacket length of 0 can cause crashes in UDC * drivers. 
*/ if (usb_endpoint_maxp(desc) == 0) { dev_dbg(dev->dev, "fail, bad endpoint maxpacket\n"); kfree(desc); return -EINVAL; } spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_free; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_free; } for (i = 0; i < dev->eps_num; i++) { ep = &dev->eps[i]; if (ep->addr != usb_endpoint_num(desc) && ep->addr != USB_RAW_EP_ADDR_ANY) continue; if (!usb_gadget_ep_match_desc(dev->gadget, ep->ep, desc, NULL)) continue; ep_props_matched = true; if (ep->state != STATE_EP_DISABLED) continue; ep->ep->desc = desc; ret = usb_ep_enable(ep->ep); if (ret < 0) { dev_err(&dev->gadget->dev, "fail, usb_ep_enable returned %d\n", ret); goto out_free; } ep->req = usb_ep_alloc_request(ep->ep, GFP_ATOMIC); if (!ep->req) { dev_err(&dev->gadget->dev, "fail, usb_ep_alloc_request failed\n"); usb_ep_disable(ep->ep); ret = -ENOMEM; goto out_free; } ep->state = STATE_EP_ENABLED; ep->ep->driver_data = ep; ret = i; goto out_unlock; } if (!ep_props_matched) { dev_dbg(&dev->gadget->dev, "fail, bad endpoint descriptor\n"); ret = -EINVAL; } else { dev_dbg(&dev->gadget->dev, "fail, no endpoints available\n"); ret = -EBUSY; } out_free: kfree(desc); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_disable(struct raw_dev *dev, unsigned long value) { int ret = 0, i = value; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (i < 0 || i >= dev->eps_num) { dev_dbg(dev->dev, "fail, invalid endpoint\n"); ret = -EBUSY; goto out_unlock; } if (dev->eps[i].state == STATE_EP_DISABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].disabling) { dev_dbg(&dev->gadget->dev, "fail, disable already in progress\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].urb_queued) { dev_dbg(&dev->gadget->dev, "fail, waiting for urb completion\n"); ret = -EINVAL; goto out_unlock; } dev->eps[i].disabling = true; spin_unlock_irqrestore(&dev->lock, flags); usb_ep_disable(dev->eps[i].ep); spin_lock_irqsave(&dev->lock, flags); usb_ep_free_request(dev->eps[i].ep, dev->eps[i].req); kfree(dev->eps[i].ep->desc); dev->eps[i].state = STATE_EP_DISABLED; dev->eps[i].disabling = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_set_clear_halt_wedge(struct raw_dev *dev, unsigned long value, bool set, bool halt) { int ret = 0, i = value; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (i < 0 || i >= dev->eps_num) { dev_dbg(dev->dev, "fail, invalid endpoint\n"); ret = -EBUSY; goto out_unlock; } if (dev->eps[i].state == STATE_EP_DISABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].disabling) { dev_dbg(&dev->gadget->dev, "fail, disable is in progress\n"); ret = -EINVAL; goto out_unlock; } if (dev->eps[i].urb_queued) { dev_dbg(&dev->gadget->dev, "fail, waiting for urb completion\n"); ret = 
-EINVAL; goto out_unlock; } if (usb_endpoint_xfer_isoc(dev->eps[i].ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, can't halt/wedge ISO endpoint\n"); ret = -EINVAL; goto out_unlock; } if (set && halt) { ret = usb_ep_set_halt(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_halt returned %d\n", ret); } else if (!set && halt) { ret = usb_ep_clear_halt(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_clear_halt returned %d\n", ret); } else if (set && !halt) { ret = usb_ep_set_wedge(dev->eps[i].ep); if (ret < 0) dev_err(&dev->gadget->dev, "fail, usb_ep_set_wedge returned %d\n", ret); } out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static void gadget_ep_complete(struct usb_ep *ep, struct usb_request *req) { struct raw_ep *r_ep = (struct raw_ep *)ep->driver_data; struct raw_dev *dev = r_ep->dev; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (req->status) r_ep->status = req->status; else r_ep->status = req->actual; spin_unlock_irqrestore(&dev->lock, flags); complete((struct completion *)req->context); } static int raw_process_ep_io(struct raw_dev *dev, struct usb_raw_ep_io *io, void *data, bool in) { int ret = 0; unsigned long flags; struct raw_ep *ep; DECLARE_COMPLETION_ONSTACK(done); spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } if (io->ep >= dev->eps_num) { dev_dbg(&dev->gadget->dev, "fail, invalid endpoint\n"); ret = -EINVAL; goto out_unlock; } ep = &dev->eps[io->ep]; if (ep->state != STATE_EP_ENABLED) { dev_dbg(&dev->gadget->dev, "fail, endpoint is not enabled\n"); ret = -EBUSY; goto out_unlock; } if (ep->disabling) { dev_dbg(&dev->gadget->dev, "fail, endpoint is already being disabled\n"); ret = -EBUSY; goto out_unlock; } if (ep->urb_queued) { dev_dbg(&dev->gadget->dev, "fail, urb already queued\n"); ret = -EBUSY; goto out_unlock; } if (in != usb_endpoint_dir_in(ep->ep->desc)) { dev_dbg(&dev->gadget->dev, "fail, wrong direction\n"); ret = -EINVAL; goto out_unlock; } ep->dev = dev; ep->req->context = &done; ep->req->complete = gadget_ep_complete; ep->req->buf = data; ep->req->length = io->length; ep->req->zero = usb_raw_io_flags_zero(io->flags); ep->urb_queued = true; spin_unlock_irqrestore(&dev->lock, flags); ret = usb_ep_queue(ep->ep, ep->req, GFP_KERNEL); if (ret) { dev_err(&dev->gadget->dev, "fail, usb_ep_queue returned %d\n", ret); spin_lock_irqsave(&dev->lock, flags); goto out_queue_failed; } ret = wait_for_completion_interruptible(&done); if (ret) { dev_dbg(&dev->gadget->dev, "wait interrupted\n"); usb_ep_dequeue(ep->ep, ep->req); wait_for_completion(&done); spin_lock_irqsave(&dev->lock, flags); if (ep->status == -ECONNRESET) ep->status = -EINTR; goto out_interrupted; } spin_lock_irqsave(&dev->lock, flags); out_interrupted: ret = ep->status; out_queue_failed: ep->urb_queued = false; out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_ep_write(struct raw_dev *dev, unsigned long value) { int ret = 0; char *data; struct usb_raw_ep_io io; data = raw_alloc_io_data(&io, (void __user *)value, true); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep_io(dev, &io, data, true); kfree(data); return ret; } static int raw_ioctl_ep_read(struct raw_dev *dev, unsigned long value) { int ret = 0; char *data; struct usb_raw_ep_io io; unsigned int length; 
data = raw_alloc_io_data(&io, (void __user *)value, false); if (IS_ERR(data)) return PTR_ERR(data); ret = raw_process_ep_io(dev, &io, data, false); if (ret < 0) goto free; length = min(io.length, (unsigned int)ret); if (copy_to_user((void __user *)(value + sizeof(io)), data, length)) ret = -EFAULT; else ret = length; free: kfree(data); return ret; } static int raw_ioctl_configure(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; if (value) return -EINVAL; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } usb_gadget_set_state(dev->gadget, USB_STATE_CONFIGURED); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static int raw_ioctl_vbus_draw(struct raw_dev *dev, unsigned long value) { int ret = 0; unsigned long flags; spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; goto out_unlock; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; goto out_unlock; } usb_gadget_vbus_draw(dev->gadget, 2 * value); out_unlock: spin_unlock_irqrestore(&dev->lock, flags); return ret; } static void fill_ep_caps(struct usb_ep_caps *caps, struct usb_raw_ep_caps *raw_caps) { raw_caps->type_control = caps->type_control; raw_caps->type_iso = caps->type_iso; raw_caps->type_bulk = caps->type_bulk; raw_caps->type_int = caps->type_int; raw_caps->dir_in = caps->dir_in; raw_caps->dir_out = caps->dir_out; } static void fill_ep_limits(struct usb_ep *ep, struct usb_raw_ep_limits *limits) { limits->maxpacket_limit = ep->maxpacket_limit; limits->max_streams = ep->max_streams; } static int raw_ioctl_eps_info(struct raw_dev *dev, unsigned long value) { int ret = 0, i; unsigned long flags; struct usb_raw_eps_info *info; struct raw_ep *ep; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { ret = -ENOMEM; goto out; } spin_lock_irqsave(&dev->lock, flags); if (dev->state != STATE_DEV_RUNNING) { dev_dbg(dev->dev, "fail, device is not running\n"); ret = -EINVAL; spin_unlock_irqrestore(&dev->lock, flags); goto out_free; } if (!dev->gadget) { dev_dbg(dev->dev, "fail, gadget is not bound\n"); ret = -EBUSY; spin_unlock_irqrestore(&dev->lock, flags); goto out_free; } for (i = 0; i < dev->eps_num; i++) { ep = &dev->eps[i]; strscpy(&info->eps[i].name[0], ep->ep->name, USB_RAW_EP_NAME_MAX); info->eps[i].addr = ep->addr; fill_ep_caps(&ep->ep->caps, &info->eps[i].caps); fill_ep_limits(ep->ep, &info->eps[i].limits); } ret = dev->eps_num; spin_unlock_irqrestore(&dev->lock, flags); if (copy_to_user((void __user *)value, info, sizeof(*info))) ret = -EFAULT; out_free: kfree(info); out: return ret; } static long raw_ioctl(struct file *fd, unsigned int cmd, unsigned long value) { struct raw_dev *dev = fd->private_data; int ret = 0; if (!dev) return -EBUSY; switch (cmd) { case USB_RAW_IOCTL_INIT: ret = raw_ioctl_init(dev, value); break; case USB_RAW_IOCTL_RUN: ret = raw_ioctl_run(dev, value); break; case USB_RAW_IOCTL_EVENT_FETCH: ret = raw_ioctl_event_fetch(dev, value); break; case USB_RAW_IOCTL_EP0_WRITE: ret = raw_ioctl_ep0_write(dev, value); break; case USB_RAW_IOCTL_EP0_READ: ret = raw_ioctl_ep0_read(dev, value); break; case USB_RAW_IOCTL_EP_ENABLE: ret = raw_ioctl_ep_enable(dev, value); break; case USB_RAW_IOCTL_EP_DISABLE: ret = raw_ioctl_ep_disable(dev, 
						value);
		break;
	case USB_RAW_IOCTL_EP_WRITE:
		ret = raw_ioctl_ep_write(dev, value);
		break;
	case USB_RAW_IOCTL_EP_READ:
		ret = raw_ioctl_ep_read(dev, value);
		break;
	case USB_RAW_IOCTL_CONFIGURE:
		ret = raw_ioctl_configure(dev, value);
		break;
	case USB_RAW_IOCTL_VBUS_DRAW:
		ret = raw_ioctl_vbus_draw(dev, value);
		break;
	case USB_RAW_IOCTL_EPS_INFO:
		ret = raw_ioctl_eps_info(dev, value);
		break;
	case USB_RAW_IOCTL_EP0_STALL:
		ret = raw_ioctl_ep0_stall(dev, value);
		break;
	case USB_RAW_IOCTL_EP_SET_HALT:
		ret = raw_ioctl_ep_set_clear_halt_wedge(
					dev, value, true, true);
		break;
	case USB_RAW_IOCTL_EP_CLEAR_HALT:
		ret = raw_ioctl_ep_set_clear_halt_wedge(
					dev, value, false, true);
		break;
	case USB_RAW_IOCTL_EP_SET_WEDGE:
		ret = raw_ioctl_ep_set_clear_halt_wedge(
					dev, value, true, false);
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}

/*----------------------------------------------------------------------*/

static const struct file_operations raw_fops = {
	.open =			raw_open,
	.unlocked_ioctl =	raw_ioctl,
	.compat_ioctl =		raw_ioctl,
	.release =		raw_release,
	.llseek =		no_llseek,
};

static struct miscdevice raw_misc_device = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = DRIVER_NAME,
	.fops = &raw_fops,
};
module_misc_device(raw_misc_device);
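/*
 * Editor's note (not part of the driver): a minimal userspace sketch of how
 * the ioctl interface above is typically driven, following the INIT -> RUN ->
 * EVENT_FETCH flow. The UDC names "dummy_udc"/"dummy_udc.0" depend on the
 * system configuration (e.g. the dummy_hcd module) and are assumptions for
 * illustration; a real gadget would decode the fetched events and respond
 * via the EP0/EP read/write ioctls.
 */
#if 0	/* illustration only, not compiled with the driver */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/usb/raw_gadget.h>

struct example_event {
	struct usb_raw_event	inner;
	char			data[4096];	/* room for event payloads */
};

int example_run_gadget(void)
{
	struct usb_raw_init init;
	struct example_event ev;
	int fd = open("/dev/raw-gadget", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&init, 0, sizeof(init));
	strcpy((char *)init.driver_name, "dummy_udc");	/* assumed UDC */
	strcpy((char *)init.device_name, "dummy_udc.0");
	init.speed = USB_SPEED_HIGH;

	if (ioctl(fd, USB_RAW_IOCTL_INIT, &init))
		return -1;
	if (ioctl(fd, USB_RAW_IOCTL_RUN, 0))
		return -1;

	for (;;) {
		ev.inner.type = 0;
		ev.inner.length = sizeof(ev.data);
		/* Blocks until the next CONNECT/CONTROL/... event. */
		if (ioctl(fd, USB_RAW_IOCTL_EVENT_FETCH, &ev.inner))
			break;
		/* Handle ev.inner.type / ev.data here. */
	}
	return 0;
}
#endif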
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Routines for driver control interface
 * Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 */

#include <linux/threads.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/math64.h>
#include <linux/sched/signal.h>
#include <sound/core.h>
#include <sound/minors.h>
#include <sound/info.h>
#include <sound/control.h>

// Max allocation size for user controls.
static int max_user_ctl_alloc_size = 8 * 1024 * 1024;
module_param_named(max_user_ctl_alloc_size, max_user_ctl_alloc_size, int, 0444);
MODULE_PARM_DESC(max_user_ctl_alloc_size, "Max allocation size for user controls");

#define MAX_CONTROL_COUNT	1028

struct snd_kctl_ioctl {
	struct list_head list;		/* list of all ioctls */
	snd_kctl_ioctl_func_t fioctl;
};

static DECLARE_RWSEM(snd_ioctl_rwsem);
static DECLARE_RWSEM(snd_ctl_layer_rwsem);
static LIST_HEAD(snd_control_ioctls);
#ifdef CONFIG_COMPAT
static LIST_HEAD(snd_control_compat_ioctls);
#endif
static struct snd_ctl_layer_ops *snd_ctl_layer;

static int snd_ctl_remove_locked(struct snd_card *card,
				 struct snd_kcontrol *kcontrol);

static int snd_ctl_open(struct inode *inode, struct file *file)
{
	unsigned long flags;
	struct snd_card *card;
	struct snd_ctl_file *ctl;
	int i, err;

	err = stream_open(inode, file);
	if (err < 0)
		return err;

	card = snd_lookup_minor_data(iminor(inode), SNDRV_DEVICE_TYPE_CONTROL);
	if (!card) {
		err = -ENODEV;
		goto __error1;
	}
	err = snd_card_file_add(card, file);
	if (err < 0) {
		err = -ENODEV;
		goto __error1;
	}
	if (!try_module_get(card->module)) {
		err = -EFAULT;
		goto __error2;
	}
	ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
	if (ctl == NULL) {
		err = -ENOMEM;
		goto __error;
	}
	INIT_LIST_HEAD(&ctl->events);
	init_waitqueue_head(&ctl->change_sleep);
	spin_lock_init(&ctl->read_lock);
	ctl->card = card;
	for (i = 0; i < SND_CTL_SUBDEV_ITEMS; i++)
		ctl->preferred_subdevice[i] = -1;
	ctl->pid = get_pid(task_pid(current));
	file->private_data = ctl;
	write_lock_irqsave(&card->ctl_files_rwlock, flags);
	list_add_tail(&ctl->list, &card->ctl_files);
	write_unlock_irqrestore(&card->ctl_files_rwlock, flags);
	snd_card_unref(card);
	return 0;

__error:
	module_put(card->module);
__error2:
	snd_card_file_remove(card, file);
__error1:
	if (card)
		snd_card_unref(card);
	return err;
}
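/*
 * Editor's example (not part of this file): a minimal sketch of how a driver
 * typically defines a control that the routines in this file manage, using
 * snd_ctl_new1() and snd_ctl_add() defined below. The "My Switch" name and
 * the my_switch_* handlers are hypothetical; a real driver would read and
 * write hardware state in get/put.
 */
#if 0	/* illustration only */
static int my_switch_info(struct snd_kcontrol *kcontrol,
			  struct snd_ctl_elem_info *uinfo)
{
	uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN;
	uinfo->count = 1;
	uinfo->value.integer.min = 0;
	uinfo->value.integer.max = 1;
	return 0;
}

static int my_switch_get(struct snd_kcontrol *kcontrol,
			 struct snd_ctl_elem_value *ucontrol)
{
	ucontrol->value.integer.value[0] = 0;	/* read hardware state here */
	return 0;
}

static int my_switch_put(struct snd_kcontrol *kcontrol,
			 struct snd_ctl_elem_value *ucontrol)
{
	/* Write hardware state here; return 1 if the value changed. */
	return 0;
}

static const struct snd_kcontrol_new my_switch = {
	.iface	= SNDRV_CTL_ELEM_IFACE_MIXER,
	.name	= "My Switch",
	.info	= my_switch_info,
	.get	= my_switch_get,
	.put	= my_switch_put,
};

/* In a hypothetical driver probe path: */
static int example_add_control(struct snd_card *card, void *chip)
{
	return snd_ctl_add(card, snd_ctl_new1(&my_switch, chip));
}
#endif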
static void snd_ctl_empty_read_queue(struct snd_ctl_file * ctl) { unsigned long flags; struct snd_kctl_event *cread; spin_lock_irqsave(&ctl->read_lock, flags); while (!list_empty(&ctl->events)) { cread = snd_kctl_event(ctl->events.next); list_del(&cread->list); kfree(cread); } spin_unlock_irqrestore(&ctl->read_lock, flags); } static int snd_ctl_release(struct inode *inode, struct file *file) { unsigned long flags; struct snd_card *card; struct snd_ctl_file *ctl; struct snd_kcontrol *control; unsigned int idx; ctl = file->private_data; file->private_data = NULL; card = ctl->card; write_lock_irqsave(&card->ctl_files_rwlock, flags); list_del(&ctl->list); write_unlock_irqrestore(&card->ctl_files_rwlock, flags); down_write(&card->controls_rwsem); list_for_each_entry(control, &card->controls, list) for (idx = 0; idx < control->count; idx++) if (control->vd[idx].owner == ctl) control->vd[idx].owner = NULL; up_write(&card->controls_rwsem); snd_fasync_free(ctl->fasync); snd_ctl_empty_read_queue(ctl); put_pid(ctl->pid); kfree(ctl); module_put(card->module); snd_card_file_remove(card, file); return 0; } /** * snd_ctl_notify - Send notification to user-space for a control change * @card: the card to send notification * @mask: the event mask, SNDRV_CTL_EVENT_* * @id: the ctl element id to send notification * * This function adds an event record with the given id and mask, appends * to the list and wakes up the user-space for notification. This can be * called in the atomic context. */ void snd_ctl_notify(struct snd_card *card, unsigned int mask, struct snd_ctl_elem_id *id) { unsigned long flags; struct snd_ctl_file *ctl; struct snd_kctl_event *ev; if (snd_BUG_ON(!card || !id)) return; if (card->shutdown) return; read_lock_irqsave(&card->ctl_files_rwlock, flags); #if IS_ENABLED(CONFIG_SND_MIXER_OSS) card->mixer_oss_change_count++; #endif list_for_each_entry(ctl, &card->ctl_files, list) { if (!ctl->subscribed) continue; spin_lock(&ctl->read_lock); list_for_each_entry(ev, &ctl->events, list) { if (ev->id.numid == id->numid) { ev->mask |= mask; goto _found; } } ev = kzalloc(sizeof(*ev), GFP_ATOMIC); if (ev) { ev->id = *id; ev->mask = mask; list_add_tail(&ev->list, &ctl->events); } else { dev_err(card->dev, "No memory available to allocate event\n"); } _found: wake_up(&ctl->change_sleep); spin_unlock(&ctl->read_lock); snd_kill_fasync(ctl->fasync, SIGIO, POLL_IN); } read_unlock_irqrestore(&card->ctl_files_rwlock, flags); } EXPORT_SYMBOL(snd_ctl_notify); /** * snd_ctl_notify_one - Send notification to user-space for a control change * @card: the card to send notification * @mask: the event mask, SNDRV_CTL_EVENT_* * @kctl: the pointer with the control instance * @ioff: the additional offset to the control index * * This function calls snd_ctl_notify() and does additional jobs * like LED state changes. 
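 * The per-layer lnotify() callbacks are invoked under the
 * snd_ctl_layer_rwsem read lock.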
 */
void snd_ctl_notify_one(struct snd_card *card, unsigned int mask,
			struct snd_kcontrol *kctl, unsigned int ioff)
{
	struct snd_ctl_elem_id id = kctl->id;
	struct snd_ctl_layer_ops *lops;

	id.index += ioff;
	id.numid += ioff;
	snd_ctl_notify(card, mask, &id);
	down_read(&snd_ctl_layer_rwsem);
	for (lops = snd_ctl_layer; lops; lops = lops->next)
		lops->lnotify(card, mask, kctl, ioff);
	up_read(&snd_ctl_layer_rwsem);
}
EXPORT_SYMBOL(snd_ctl_notify_one);

/**
 * snd_ctl_new - create a new control instance with some elements
 * @kctl: the pointer to store new control instance
 * @count: the number of elements in this control
 * @access: the default access flags for elements in this control
 * @file: given when locking these elements
 *
 * Allocates a memory object for a new control instance. The instance has
 * as many elements as the given number (@count). Each element has the
 * given access permissions (@access). Each element is locked when @file
 * is given.
 *
 * Return: 0 on success, error code on failure
 */
static int snd_ctl_new(struct snd_kcontrol **kctl, unsigned int count,
		       unsigned int access, struct snd_ctl_file *file)
{
	unsigned int idx;

	if (count == 0 || count > MAX_CONTROL_COUNT)
		return -EINVAL;

	*kctl = kzalloc(struct_size(*kctl, vd, count), GFP_KERNEL);
	if (!*kctl)
		return -ENOMEM;

	for (idx = 0; idx < count; idx++) {
		(*kctl)->vd[idx].access = access;
		(*kctl)->vd[idx].owner = file;
	}
	(*kctl)->count = count;

	return 0;
}

/**
 * snd_ctl_new1 - create a control instance from the template
 * @ncontrol: the initialization record
 * @private_data: the private data to set
 *
 * Allocates a new struct snd_kcontrol instance and initializes it from the
 * given template. When the access field of ncontrol is 0, READWRITE access
 * is assumed. When the count field is 0, a count of one is assumed.
 *
 * Return: The pointer to the newly created instance, or %NULL on failure.
 */
struct snd_kcontrol *snd_ctl_new1(const struct snd_kcontrol_new *ncontrol,
				  void *private_data)
{
	struct snd_kcontrol *kctl;
	unsigned int count;
	unsigned int access;
	int err;

	if (snd_BUG_ON(!ncontrol || !ncontrol->info))
		return NULL;

	count = ncontrol->count;
	if (count == 0)
		count = 1;

	access = ncontrol->access;
	if (access == 0)
		access = SNDRV_CTL_ELEM_ACCESS_READWRITE;
	access &= (SNDRV_CTL_ELEM_ACCESS_READWRITE |
		   SNDRV_CTL_ELEM_ACCESS_VOLATILE |
		   SNDRV_CTL_ELEM_ACCESS_INACTIVE |
		   SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE |
		   SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND |
		   SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK |
		   SNDRV_CTL_ELEM_ACCESS_LED_MASK |
		   SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK);

	err = snd_ctl_new(&kctl, count, access, NULL);
	if (err < 0)
		return NULL;

	/* The 'numid' member is decided when calling snd_ctl_add(). */
	kctl->id.iface = ncontrol->iface;
	kctl->id.device = ncontrol->device;
	kctl->id.subdevice = ncontrol->subdevice;
	if (ncontrol->name) {
		strscpy(kctl->id.name, ncontrol->name, sizeof(kctl->id.name));
		if (strcmp(ncontrol->name, kctl->id.name) != 0)
			pr_warn("ALSA: Control name '%s' truncated to '%s'\n",
				ncontrol->name, kctl->id.name);
	}
	kctl->id.index = ncontrol->index;
	kctl->info = ncontrol->info;
	kctl->get = ncontrol->get;
	kctl->put = ncontrol->put;
	kctl->tlv.p = ncontrol->tlv.p;
	kctl->private_value = ncontrol->private_value;
	kctl->private_data = private_data;
	return kctl;
}
EXPORT_SYMBOL(snd_ctl_new1);

/**
 * snd_ctl_free_one - release the control instance
 * @kcontrol: the control instance
 *
 * Releases the control instance created via snd_ctl_new()
 * or snd_ctl_new1().
 * Don't call this after the control was added to the card.
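 * For a control that has already been added, use snd_ctl_remove() instead.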
*/ void snd_ctl_free_one(struct snd_kcontrol *kcontrol) { if (kcontrol) { if (kcontrol->private_free) kcontrol->private_free(kcontrol); kfree(kcontrol); } } EXPORT_SYMBOL(snd_ctl_free_one); static bool snd_ctl_remove_numid_conflict(struct snd_card *card, unsigned int count) { struct snd_kcontrol *kctl; /* Make sure that the ids assigned to the control do not wrap around */ if (card->last_numid >= UINT_MAX - count) card->last_numid = 0; list_for_each_entry(kctl, &card->controls, list) { if (kctl->id.numid < card->last_numid + 1 + count && kctl->id.numid + kctl->count > card->last_numid + 1) { card->last_numid = kctl->id.numid + kctl->count - 1; return true; } } return false; } static int snd_ctl_find_hole(struct snd_card *card, unsigned int count) { unsigned int iter = 100000; while (snd_ctl_remove_numid_conflict(card, count)) { if (--iter == 0) { /* this situation is very unlikely */ dev_err(card->dev, "unable to allocate new control numid\n"); return -ENOMEM; } } return 0; } /* check whether the given id is contained in the given kctl */ static bool elem_id_matches(const struct snd_kcontrol *kctl, const struct snd_ctl_elem_id *id) { return kctl->id.iface == id->iface && kctl->id.device == id->device && kctl->id.subdevice == id->subdevice && !strncmp(kctl->id.name, id->name, sizeof(kctl->id.name)) && kctl->id.index <= id->index && kctl->id.index + kctl->count > id->index; } #ifdef CONFIG_SND_CTL_FAST_LOOKUP /* Compute a hash key for the corresponding ctl id * It's for the name lookup, hence the numid is excluded. * The hash key is bound in LONG_MAX to be used for Xarray key. */ #define MULTIPLIER 37 static unsigned long get_ctl_id_hash(const struct snd_ctl_elem_id *id) { int i; unsigned long h; h = id->iface; h = MULTIPLIER * h + id->device; h = MULTIPLIER * h + id->subdevice; for (i = 0; i < SNDRV_CTL_ELEM_ID_NAME_MAXLEN && id->name[i]; i++) h = MULTIPLIER * h + id->name[i]; h = MULTIPLIER * h + id->index; h &= LONG_MAX; return h; } /* add hash entries to numid and ctl xarray tables */ static void add_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { struct snd_ctl_elem_id id = kcontrol->id; int i; xa_store_range(&card->ctl_numids, kcontrol->id.numid, kcontrol->id.numid + kcontrol->count - 1, kcontrol, GFP_KERNEL); for (i = 0; i < kcontrol->count; i++) { id.index = kcontrol->id.index + i; if (xa_insert(&card->ctl_hash, get_ctl_id_hash(&id), kcontrol, GFP_KERNEL)) { /* skip hash for this entry, noting we had collision */ card->ctl_hash_collision = true; dev_dbg(card->dev, "ctl_hash collision %d:%s:%d\n", id.iface, id.name, id.index); } } } /* remove hash entries that have been added */ static void remove_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { struct snd_ctl_elem_id id = kcontrol->id; struct snd_kcontrol *matched; unsigned long h; int i; for (i = 0; i < kcontrol->count; i++) { xa_erase(&card->ctl_numids, id.numid); h = get_ctl_id_hash(&id); matched = xa_load(&card->ctl_hash, h); if (matched && (matched == kcontrol || elem_id_matches(matched, &id))) xa_erase(&card->ctl_hash, h); id.index++; id.numid++; } } #else /* CONFIG_SND_CTL_FAST_LOOKUP */ static inline void add_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { } static inline void remove_hash_entries(struct snd_card *card, struct snd_kcontrol *kcontrol) { } #endif /* CONFIG_SND_CTL_FAST_LOOKUP */ enum snd_ctl_add_mode { CTL_ADD_EXCLUSIVE, CTL_REPLACE, CTL_ADD_ON_REPLACE, }; /* add/replace a new kcontrol object; call with card->controls_rwsem locked */ static int 
__snd_ctl_add_replace(struct snd_card *card, struct snd_kcontrol *kcontrol,
		      enum snd_ctl_add_mode mode)
{
	struct snd_ctl_elem_id id;
	unsigned int idx;
	struct snd_kcontrol *old;
	int err;

	lockdep_assert_held_write(&card->controls_rwsem);

	id = kcontrol->id;
	if (id.index > UINT_MAX - kcontrol->count)
		return -EINVAL;

	old = snd_ctl_find_id_locked(card, &id);
	if (!old) {
		if (mode == CTL_REPLACE)
			return -EINVAL;
	} else {
		if (mode == CTL_ADD_EXCLUSIVE) {
			dev_err(card->dev,
				"control %i:%i:%i:%s:%i is already present\n",
				id.iface, id.device, id.subdevice, id.name,
				id.index);
			return -EBUSY;
		}

		err = snd_ctl_remove_locked(card, old);
		if (err < 0)
			return err;
	}

	if (snd_ctl_find_hole(card, kcontrol->count) < 0)
		return -ENOMEM;

	list_add_tail(&kcontrol->list, &card->controls);
	card->controls_count += kcontrol->count;
	kcontrol->id.numid = card->last_numid + 1;
	card->last_numid += kcontrol->count;

	add_hash_entries(card, kcontrol);

	for (idx = 0; idx < kcontrol->count; idx++)
		snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_ADD, kcontrol, idx);

	return 0;
}

static int snd_ctl_add_replace(struct snd_card *card,
			       struct snd_kcontrol *kcontrol,
			       enum snd_ctl_add_mode mode)
{
	int err = -EINVAL;

	if (!kcontrol)
		return err;
	if (snd_BUG_ON(!card || !kcontrol->info))
		goto error;

	down_write(&card->controls_rwsem);
	err = __snd_ctl_add_replace(card, kcontrol, mode);
	up_write(&card->controls_rwsem);
	if (err < 0)
		goto error;
	return 0;

 error:
	snd_ctl_free_one(kcontrol);
	return err;
}

/**
 * snd_ctl_add - add the control instance to the card
 * @card: the card instance
 * @kcontrol: the control instance to add
 *
 * Adds the control instance created via snd_ctl_new() or
 * snd_ctl_new1() to the given card. It also assigns a unique
 * numid used for fast search.
 *
 * A control that cannot be added is freed automatically.
 *
 * Return: Zero if successful, or a negative error code on failure.
 */
int snd_ctl_add(struct snd_card *card, struct snd_kcontrol *kcontrol)
{
	return snd_ctl_add_replace(card, kcontrol, CTL_ADD_EXCLUSIVE);
}
EXPORT_SYMBOL(snd_ctl_add);

/**
 * snd_ctl_replace - replace the control instance of the card
 * @card: the card instance
 * @kcontrol: the control instance to replace
 * @add_on_replace: add the control if not already added
 *
 * Replaces the given control. If the given control does not exist
 * and the add_on_replace flag is set, the control is added. If the
 * control exists, it is destroyed first.
 *
 * A control that cannot be added or replaced is freed automatically.
 *
 * Return: Zero if successful, or a negative error code on failure.
 */
int snd_ctl_replace(struct snd_card *card, struct snd_kcontrol *kcontrol,
		    bool add_on_replace)
{
	return snd_ctl_add_replace(card, kcontrol, add_on_replace ?
				   CTL_ADD_ON_REPLACE : CTL_REPLACE);
}
EXPORT_SYMBOL(snd_ctl_replace);

static int __snd_ctl_remove(struct snd_card *card,
			    struct snd_kcontrol *kcontrol,
			    bool remove_hash)
{
	unsigned int idx;

	lockdep_assert_held_write(&card->controls_rwsem);

	if (snd_BUG_ON(!card || !kcontrol))
		return -EINVAL;
	list_del(&kcontrol->list);

	if (remove_hash)
		remove_hash_entries(card, kcontrol);

	card->controls_count -= kcontrol->count;
	for (idx = 0; idx < kcontrol->count; idx++)
		snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_REMOVE, kcontrol, idx);
	snd_ctl_free_one(kcontrol);
	return 0;
}

static inline int snd_ctl_remove_locked(struct snd_card *card,
					struct snd_kcontrol *kcontrol)
{
	return __snd_ctl_remove(card, kcontrol, true);
}

/**
 * snd_ctl_remove - remove the control from the card and release it
 * @card: the card instance
 * @kcontrol: the control instance to remove
 *
 * Removes the control from the card and then releases the instance.
 * You don't need to call snd_ctl_free_one().
 *
 * Return: 0 if successful, or a negative error code on failure.
 *
 * Note that this function takes card->controls_rwsem lock internally.
 */
int snd_ctl_remove(struct snd_card *card, struct snd_kcontrol *kcontrol)
{
	int ret;

	down_write(&card->controls_rwsem);
	ret = snd_ctl_remove_locked(card, kcontrol);
	up_write(&card->controls_rwsem);
	return ret;
}
EXPORT_SYMBOL(snd_ctl_remove);

/**
 * snd_ctl_remove_id - remove the control of the given id and release it
 * @card: the card instance
 * @id: the control id to remove
 *
 * Finds the control instance with the given id, removes it from the
 * card list and releases it.
 *
 * Return: 0 if successful, or a negative error code on failure.
 */
int snd_ctl_remove_id(struct snd_card *card, struct snd_ctl_elem_id *id)
{
	struct snd_kcontrol *kctl;
	int ret;

	down_write(&card->controls_rwsem);
	kctl = snd_ctl_find_id_locked(card, id);
	if (kctl == NULL) {
		up_write(&card->controls_rwsem);
		return -ENOENT;
	}
	ret = snd_ctl_remove_locked(card, kctl);
	up_write(&card->controls_rwsem);
	return ret;
}
EXPORT_SYMBOL(snd_ctl_remove_id);

/**
 * snd_ctl_remove_user_ctl - remove and release the unlocked user control
 * @file: active control handle
 * @id: the control id to remove
 *
 * Finds the control instance with the given id, removes it from the
 * card list and releases it.
 *
 * Return: 0 if successful, or a negative error code on failure.
 */
static int snd_ctl_remove_user_ctl(struct snd_ctl_file *file,
				   struct snd_ctl_elem_id *id)
{
	struct snd_card *card = file->card;
	struct snd_kcontrol *kctl;
	int idx, ret;

	down_write(&card->controls_rwsem);
	kctl = snd_ctl_find_id_locked(card, id);
	if (kctl == NULL) {
		ret = -ENOENT;
		goto error;
	}
	if (!(kctl->vd[0].access & SNDRV_CTL_ELEM_ACCESS_USER)) {
		ret = -EINVAL;
		goto error;
	}
	for (idx = 0; idx < kctl->count; idx++)
		if (kctl->vd[idx].owner != NULL && kctl->vd[idx].owner != file) {
			ret = -EBUSY;
			goto error;
		}
	ret = snd_ctl_remove_locked(card, kctl);
 error:
	up_write(&card->controls_rwsem);
	return ret;
}

/**
 * snd_ctl_activate_id - activate/inactivate the control of the given id
 * @card: the card instance
 * @id: the control id to activate/inactivate
 * @active: non-zero to activate
 *
 * Finds the control instance with the given id, and activates or
 * inactivates the control together with a notification, if the state
 * actually changes.
 * The given ID data is filled with the full information.
 *
 * Return: 0 if unchanged, 1 if changed, or a negative error code on failure.
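 *
 * Note that this function takes card->controls_rwsem lock internally.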
*/ int snd_ctl_activate_id(struct snd_card *card, struct snd_ctl_elem_id *id, int active) { struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; int ret; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, id); if (kctl == NULL) { ret = -ENOENT; goto unlock; } index_offset = snd_ctl_get_ioff(kctl, id); vd = &kctl->vd[index_offset]; ret = 0; if (active) { if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_INACTIVE)) goto unlock; vd->access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; } else { if (vd->access & SNDRV_CTL_ELEM_ACCESS_INACTIVE) goto unlock; vd->access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; } snd_ctl_build_ioff(id, kctl, index_offset); downgrade_write(&card->controls_rwsem); snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_INFO, kctl, index_offset); up_read(&card->controls_rwsem); return 1; unlock: up_write(&card->controls_rwsem); return ret; } EXPORT_SYMBOL_GPL(snd_ctl_activate_id); /** * snd_ctl_rename_id - replace the id of a control on the card * @card: the card instance * @src_id: the old id * @dst_id: the new id * * Finds the control with the old id from the card, and replaces the * id with the new one. * * The function tries to keep the already assigned numid while replacing * the rest. * * Note that this function should be used only in the card initialization * phase. Calling after the card instantiation may cause issues with * user-space expecting persistent numids. * * Return: Zero if successful, or a negative error code on failure. */ int snd_ctl_rename_id(struct snd_card *card, struct snd_ctl_elem_id *src_id, struct snd_ctl_elem_id *dst_id) { struct snd_kcontrol *kctl; int saved_numid; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, src_id); if (kctl == NULL) { up_write(&card->controls_rwsem); return -ENOENT; } saved_numid = kctl->id.numid; remove_hash_entries(card, kctl); kctl->id = *dst_id; kctl->id.numid = saved_numid; add_hash_entries(card, kctl); up_write(&card->controls_rwsem); return 0; } EXPORT_SYMBOL(snd_ctl_rename_id); /** * snd_ctl_rename - rename the control on the card * @card: the card instance * @kctl: the control to rename * @name: the new name * * Renames the specified control on the card to the new name. * * Note that this function takes card->controls_rwsem lock internally. */ void snd_ctl_rename(struct snd_card *card, struct snd_kcontrol *kctl, const char *name) { down_write(&card->controls_rwsem); remove_hash_entries(card, kctl); if (strscpy(kctl->id.name, name, sizeof(kctl->id.name)) < 0) pr_warn("ALSA: Renamed control new name '%s' truncated to '%s'\n", name, kctl->id.name); add_hash_entries(card, kctl); up_write(&card->controls_rwsem); } EXPORT_SYMBOL(snd_ctl_rename); #ifndef CONFIG_SND_CTL_FAST_LOOKUP static struct snd_kcontrol * snd_ctl_find_numid_slow(struct snd_card *card, unsigned int numid) { struct snd_kcontrol *kctl; list_for_each_entry(kctl, &card->controls, list) { if (kctl->id.numid <= numid && kctl->id.numid + kctl->count > numid) return kctl; } return NULL; } #endif /* !CONFIG_SND_CTL_FAST_LOOKUP */ /** * snd_ctl_find_numid_locked - find the control instance with the given number-id * @card: the card instance * @numid: the number-id to search * * Finds the control instance with the given number-id from the card. * * The caller must down card->controls_rwsem before calling this function * (if the race condition can happen). * * Return: The pointer of the instance if found, or %NULL if not. 
*/ struct snd_kcontrol * snd_ctl_find_numid_locked(struct snd_card *card, unsigned int numid) { if (snd_BUG_ON(!card || !numid)) return NULL; lockdep_assert_held(&card->controls_rwsem); #ifdef CONFIG_SND_CTL_FAST_LOOKUP return xa_load(&card->ctl_numids, numid); #else return snd_ctl_find_numid_slow(card, numid); #endif } EXPORT_SYMBOL(snd_ctl_find_numid_locked); /** * snd_ctl_find_numid - find the control instance with the given number-id * @card: the card instance * @numid: the number-id to search * * Finds the control instance with the given number-id from the card. * * Return: The pointer of the instance if found, or %NULL if not. * * Note that this function takes card->controls_rwsem lock internally. */ struct snd_kcontrol *snd_ctl_find_numid(struct snd_card *card, unsigned int numid) { struct snd_kcontrol *kctl; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); up_read(&card->controls_rwsem); return kctl; } EXPORT_SYMBOL(snd_ctl_find_numid); /** * snd_ctl_find_id_locked - find the control instance with the given id * @card: the card instance * @id: the id to search * * Finds the control instance with the given id from the card. * * The caller must down card->controls_rwsem before calling this function * (if the race condition can happen). * * Return: The pointer of the instance if found, or %NULL if not. */ struct snd_kcontrol *snd_ctl_find_id_locked(struct snd_card *card, const struct snd_ctl_elem_id *id) { struct snd_kcontrol *kctl; if (snd_BUG_ON(!card || !id)) return NULL; lockdep_assert_held(&card->controls_rwsem); if (id->numid != 0) return snd_ctl_find_numid_locked(card, id->numid); #ifdef CONFIG_SND_CTL_FAST_LOOKUP kctl = xa_load(&card->ctl_hash, get_ctl_id_hash(id)); if (kctl && elem_id_matches(kctl, id)) return kctl; if (!card->ctl_hash_collision) return NULL; /* we can rely on only hash table */ #endif /* no matching in hash table - try all as the last resort */ list_for_each_entry(kctl, &card->controls, list) if (elem_id_matches(kctl, id)) return kctl; return NULL; } EXPORT_SYMBOL(snd_ctl_find_id_locked); /** * snd_ctl_find_id - find the control instance with the given id * @card: the card instance * @id: the id to search * * Finds the control instance with the given id from the card. * * Return: The pointer of the instance if found, or %NULL if not. * * Note that this function takes card->controls_rwsem lock internally. */ struct snd_kcontrol *snd_ctl_find_id(struct snd_card *card, const struct snd_ctl_elem_id *id) { struct snd_kcontrol *kctl; down_read(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, id); up_read(&card->controls_rwsem); return kctl; } EXPORT_SYMBOL(snd_ctl_find_id); static int snd_ctl_card_info(struct snd_card *card, struct snd_ctl_file * ctl, unsigned int cmd, void __user *arg) { struct snd_ctl_card_info *info; info = kzalloc(sizeof(*info), GFP_KERNEL); if (! 
info) return -ENOMEM; down_read(&snd_ioctl_rwsem); info->card = card->number; strscpy(info->id, card->id, sizeof(info->id)); strscpy(info->driver, card->driver, sizeof(info->driver)); strscpy(info->name, card->shortname, sizeof(info->name)); strscpy(info->longname, card->longname, sizeof(info->longname)); strscpy(info->mixername, card->mixername, sizeof(info->mixername)); strscpy(info->components, card->components, sizeof(info->components)); up_read(&snd_ioctl_rwsem); if (copy_to_user(arg, info, sizeof(struct snd_ctl_card_info))) { kfree(info); return -EFAULT; } kfree(info); return 0; } static int snd_ctl_elem_list(struct snd_card *card, struct snd_ctl_elem_list *list) { struct snd_kcontrol *kctl; struct snd_ctl_elem_id id; unsigned int offset, space, jidx; int err = 0; offset = list->offset; space = list->space; down_read(&card->controls_rwsem); list->count = card->controls_count; list->used = 0; if (space > 0) { list_for_each_entry(kctl, &card->controls, list) { if (offset >= kctl->count) { offset -= kctl->count; continue; } for (jidx = offset; jidx < kctl->count; jidx++) { snd_ctl_build_ioff(&id, kctl, jidx); if (copy_to_user(list->pids + list->used, &id, sizeof(id))) { err = -EFAULT; goto out; } list->used++; if (!--space) goto out; } offset = 0; } } out: up_read(&card->controls_rwsem); return err; } static int snd_ctl_elem_list_user(struct snd_card *card, struct snd_ctl_elem_list __user *_list) { struct snd_ctl_elem_list list; int err; if (copy_from_user(&list, _list, sizeof(list))) return -EFAULT; err = snd_ctl_elem_list(card, &list); if (err) return err; if (copy_to_user(_list, &list, sizeof(list))) return -EFAULT; return 0; } /* Check whether the given kctl info is valid */ static int snd_ctl_check_elem_info(struct snd_card *card, const struct snd_ctl_elem_info *info) { static const unsigned int max_value_counts[] = { [SNDRV_CTL_ELEM_TYPE_BOOLEAN] = 128, [SNDRV_CTL_ELEM_TYPE_INTEGER] = 128, [SNDRV_CTL_ELEM_TYPE_ENUMERATED] = 128, [SNDRV_CTL_ELEM_TYPE_BYTES] = 512, [SNDRV_CTL_ELEM_TYPE_IEC958] = 1, [SNDRV_CTL_ELEM_TYPE_INTEGER64] = 64, }; if (info->type < SNDRV_CTL_ELEM_TYPE_BOOLEAN || info->type > SNDRV_CTL_ELEM_TYPE_INTEGER64) { if (card) dev_err(card->dev, "control %i:%i:%i:%s:%i: invalid type %d\n", info->id.iface, info->id.device, info->id.subdevice, info->id.name, info->id.index, info->type); return -EINVAL; } if (info->type == SNDRV_CTL_ELEM_TYPE_ENUMERATED && info->value.enumerated.items == 0) { if (card) dev_err(card->dev, "control %i:%i:%i:%s:%i: zero enum items\n", info->id.iface, info->id.device, info->id.subdevice, info->id.name, info->id.index); return -EINVAL; } if (info->count > max_value_counts[info->type]) { if (card) dev_err(card->dev, "control %i:%i:%i:%s:%i: invalid count %d\n", info->id.iface, info->id.device, info->id.subdevice, info->id.name, info->id.index, info->count); return -EINVAL; } return 0; } /* The capacity of struct snd_ctl_elem_value.value.*/ static const unsigned int value_sizes[] = { [SNDRV_CTL_ELEM_TYPE_BOOLEAN] = sizeof(long), [SNDRV_CTL_ELEM_TYPE_INTEGER] = sizeof(long), [SNDRV_CTL_ELEM_TYPE_ENUMERATED] = sizeof(unsigned int), [SNDRV_CTL_ELEM_TYPE_BYTES] = sizeof(unsigned char), [SNDRV_CTL_ELEM_TYPE_IEC958] = sizeof(struct snd_aes_iec958), [SNDRV_CTL_ELEM_TYPE_INTEGER64] = sizeof(long long), }; /* fill the remaining snd_ctl_elem_value data with the given pattern */ static void fill_remaining_elem_value(struct snd_ctl_elem_value *control, struct snd_ctl_elem_info *info, u32 pattern) { size_t offset = value_sizes[info->type] * info->count; 
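	/* convert the byte size to the number of u32 slots already occupied */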
offset = DIV_ROUND_UP(offset, sizeof(u32)); memset32((u32 *)control->value.bytes.data + offset, pattern, sizeof(control->value) / sizeof(u32) - offset); } /* check whether the given integer ctl value is valid */ static int sanity_check_int_value(struct snd_card *card, const struct snd_ctl_elem_value *control, const struct snd_ctl_elem_info *info, int i, bool print_error) { long long lval, lmin, lmax, lstep; u64 rem; switch (info->type) { default: case SNDRV_CTL_ELEM_TYPE_BOOLEAN: lval = control->value.integer.value[i]; lmin = 0; lmax = 1; lstep = 0; break; case SNDRV_CTL_ELEM_TYPE_INTEGER: lval = control->value.integer.value[i]; lmin = info->value.integer.min; lmax = info->value.integer.max; lstep = info->value.integer.step; break; case SNDRV_CTL_ELEM_TYPE_INTEGER64: lval = control->value.integer64.value[i]; lmin = info->value.integer64.min; lmax = info->value.integer64.max; lstep = info->value.integer64.step; break; case SNDRV_CTL_ELEM_TYPE_ENUMERATED: lval = control->value.enumerated.item[i]; lmin = 0; lmax = info->value.enumerated.items - 1; lstep = 0; break; } if (lval < lmin || lval > lmax) { if (print_error) dev_err(card->dev, "control %i:%i:%i:%s:%i: value out of range %lld (%lld/%lld) at count %i\n", control->id.iface, control->id.device, control->id.subdevice, control->id.name, control->id.index, lval, lmin, lmax, i); return -EINVAL; } if (lstep) { div64_u64_rem(lval, lstep, &rem); if (rem) { if (print_error) dev_err(card->dev, "control %i:%i:%i:%s:%i: unaligned value %lld (step %lld) at count %i\n", control->id.iface, control->id.device, control->id.subdevice, control->id.name, control->id.index, lval, lstep, i); return -EINVAL; } } return 0; } /* check whether the all input values are valid for the given elem value */ static int sanity_check_input_values(struct snd_card *card, const struct snd_ctl_elem_value *control, const struct snd_ctl_elem_info *info, bool print_error) { int i, ret; switch (info->type) { case SNDRV_CTL_ELEM_TYPE_BOOLEAN: case SNDRV_CTL_ELEM_TYPE_INTEGER: case SNDRV_CTL_ELEM_TYPE_INTEGER64: case SNDRV_CTL_ELEM_TYPE_ENUMERATED: for (i = 0; i < info->count; i++) { ret = sanity_check_int_value(card, control, info, i, print_error); if (ret < 0) return ret; } break; default: break; } return 0; } /* perform sanity checks to the given snd_ctl_elem_value object */ static int sanity_check_elem_value(struct snd_card *card, const struct snd_ctl_elem_value *control, const struct snd_ctl_elem_info *info, u32 pattern) { size_t offset; int ret; u32 *p; ret = sanity_check_input_values(card, control, info, true); if (ret < 0) return ret; /* check whether the remaining area kept untouched */ offset = value_sizes[info->type] * info->count; offset = DIV_ROUND_UP(offset, sizeof(u32)); p = (u32 *)control->value.bytes.data + offset; for (; offset < sizeof(control->value) / sizeof(u32); offset++, p++) { if (*p != pattern) { ret = -EINVAL; break; } *p = 0; /* clear the checked area */ } return ret; } static int __snd_ctl_elem_info(struct snd_card *card, struct snd_kcontrol *kctl, struct snd_ctl_elem_info *info, struct snd_ctl_file *ctl) { struct snd_kcontrol_volatile *vd; unsigned int index_offset; int result; #ifdef CONFIG_SND_DEBUG info->access = 0; #endif result = snd_power_ref_and_wait(card); if (!result) result = kctl->info(kctl, info); snd_power_unref(card); if (result >= 0) { snd_BUG_ON(info->access); index_offset = snd_ctl_get_ioff(kctl, &info->id); vd = &kctl->vd[index_offset]; snd_ctl_build_ioff(&info->id, kctl, index_offset); info->access = vd->access; if (vd->owner) { 
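			/* the element is locked by some ctl file; report lock and owner info */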
info->access |= SNDRV_CTL_ELEM_ACCESS_LOCK; if (vd->owner == ctl) info->access |= SNDRV_CTL_ELEM_ACCESS_OWNER; info->owner = pid_vnr(vd->owner->pid); } else { info->owner = -1; } if (!snd_ctl_skip_validation(info) && snd_ctl_check_elem_info(card, info) < 0) result = -EINVAL; } return result; } static int snd_ctl_elem_info(struct snd_ctl_file *ctl, struct snd_ctl_elem_info *info) { struct snd_card *card = ctl->card; struct snd_kcontrol *kctl; int result; down_read(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, &info->id); if (kctl == NULL) result = -ENOENT; else result = __snd_ctl_elem_info(card, kctl, info, ctl); up_read(&card->controls_rwsem); return result; } static int snd_ctl_elem_info_user(struct snd_ctl_file *ctl, struct snd_ctl_elem_info __user *_info) { struct snd_ctl_elem_info info; int result; if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; result = snd_ctl_elem_info(ctl, &info); if (result < 0) return result; /* drop internal access flags */ info.access &= ~(SNDRV_CTL_ELEM_ACCESS_SKIP_CHECK| SNDRV_CTL_ELEM_ACCESS_LED_MASK); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return result; } static int snd_ctl_elem_read(struct snd_card *card, struct snd_ctl_elem_value *control) { struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; struct snd_ctl_elem_info info; const u32 pattern = 0xdeadbeef; int ret; down_read(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, &control->id); if (kctl == NULL) { ret = -ENOENT; goto unlock; } index_offset = snd_ctl_get_ioff(kctl, &control->id); vd = &kctl->vd[index_offset]; if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_READ) || kctl->get == NULL) { ret = -EPERM; goto unlock; } snd_ctl_build_ioff(&control->id, kctl, index_offset); #ifdef CONFIG_SND_CTL_DEBUG /* info is needed only for validation */ memset(&info, 0, sizeof(info)); info.id = control->id; ret = __snd_ctl_elem_info(card, kctl, &info, NULL); if (ret < 0) goto unlock; #endif if (!snd_ctl_skip_validation(&info)) fill_remaining_elem_value(control, &info, pattern); ret = snd_power_ref_and_wait(card); if (!ret) ret = kctl->get(kctl, control); snd_power_unref(card); if (ret < 0) goto unlock; if (!snd_ctl_skip_validation(&info) && sanity_check_elem_value(card, control, &info, pattern) < 0) { dev_err(card->dev, "control %i:%i:%i:%s:%i: access overflow\n", control->id.iface, control->id.device, control->id.subdevice, control->id.name, control->id.index); ret = -EINVAL; goto unlock; } unlock: up_read(&card->controls_rwsem); return ret; } static int snd_ctl_elem_read_user(struct snd_card *card, struct snd_ctl_elem_value __user *_control) { struct snd_ctl_elem_value *control; int result; control = memdup_user(_control, sizeof(*control)); if (IS_ERR(control)) return PTR_ERR(control); result = snd_ctl_elem_read(card, control); if (result < 0) goto error; if (copy_to_user(_control, control, sizeof(*control))) result = -EFAULT; error: kfree(control); return result; } static int snd_ctl_elem_write(struct snd_card *card, struct snd_ctl_file *file, struct snd_ctl_elem_value *control) { struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; unsigned int index_offset; int result; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, &control->id); if (kctl == NULL) { up_write(&card->controls_rwsem); return -ENOENT; } index_offset = snd_ctl_get_ioff(kctl, &control->id); vd = &kctl->vd[index_offset]; if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_WRITE) || kctl->put == NULL || (file && vd->owner && vd->owner != 
file)) { up_write(&card->controls_rwsem); return -EPERM; } snd_ctl_build_ioff(&control->id, kctl, index_offset); result = snd_power_ref_and_wait(card); /* validate input values */ if (IS_ENABLED(CONFIG_SND_CTL_INPUT_VALIDATION) && !result) { struct snd_ctl_elem_info info; memset(&info, 0, sizeof(info)); info.id = control->id; result = __snd_ctl_elem_info(card, kctl, &info, NULL); if (!result) result = sanity_check_input_values(card, control, &info, false); } if (!result) result = kctl->put(kctl, control); snd_power_unref(card); if (result < 0) { up_write(&card->controls_rwsem); return result; } if (result > 0) { downgrade_write(&card->controls_rwsem); snd_ctl_notify_one(card, SNDRV_CTL_EVENT_MASK_VALUE, kctl, index_offset); up_read(&card->controls_rwsem); } else { up_write(&card->controls_rwsem); } return 0; } static int snd_ctl_elem_write_user(struct snd_ctl_file *file, struct snd_ctl_elem_value __user *_control) { struct snd_ctl_elem_value *control; struct snd_card *card; int result; control = memdup_user(_control, sizeof(*control)); if (IS_ERR(control)) return PTR_ERR(control); card = file->card; result = snd_ctl_elem_write(card, file, control); if (result < 0) goto error; if (copy_to_user(_control, control, sizeof(*control))) result = -EFAULT; error: kfree(control); return result; } static int snd_ctl_elem_lock(struct snd_ctl_file *file, struct snd_ctl_elem_id __user *_id) { struct snd_card *card = file->card; struct snd_ctl_elem_id id; struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; int result; if (copy_from_user(&id, _id, sizeof(id))) return -EFAULT; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, &id); if (kctl == NULL) { result = -ENOENT; } else { vd = &kctl->vd[snd_ctl_get_ioff(kctl, &id)]; if (vd->owner != NULL) result = -EBUSY; else { vd->owner = file; result = 0; } } up_write(&card->controls_rwsem); return result; } static int snd_ctl_elem_unlock(struct snd_ctl_file *file, struct snd_ctl_elem_id __user *_id) { struct snd_card *card = file->card; struct snd_ctl_elem_id id; struct snd_kcontrol *kctl; struct snd_kcontrol_volatile *vd; int result; if (copy_from_user(&id, _id, sizeof(id))) return -EFAULT; down_write(&card->controls_rwsem); kctl = snd_ctl_find_id_locked(card, &id); if (kctl == NULL) { result = -ENOENT; } else { vd = &kctl->vd[snd_ctl_get_ioff(kctl, &id)]; if (vd->owner == NULL) result = -EINVAL; else if (vd->owner != file) result = -EPERM; else { vd->owner = NULL; result = 0; } } up_write(&card->controls_rwsem); return result; } struct user_element { struct snd_ctl_elem_info info; struct snd_card *card; char *elem_data; /* element data */ unsigned long elem_data_size; /* size of element data in bytes */ void *tlv_data; /* TLV data */ unsigned long tlv_data_size; /* TLV data size */ void *priv_data; /* private data (like strings for enumerated type) */ }; // check whether the addition (in bytes) of user ctl element may overflow the limit. 
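// The limit is the max_user_ctl_alloc_size module parameter defined at the top of this file.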
static bool check_user_elem_overflow(struct snd_card *card, ssize_t add) { return (ssize_t)card->user_ctl_alloc_size + add > max_user_ctl_alloc_size; } static int snd_ctl_elem_user_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { struct user_element *ue = kcontrol->private_data; unsigned int offset; offset = snd_ctl_get_ioff(kcontrol, &uinfo->id); *uinfo = ue->info; snd_ctl_build_ioff(&uinfo->id, kcontrol, offset); return 0; } static int snd_ctl_elem_user_enum_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { struct user_element *ue = kcontrol->private_data; const char *names; unsigned int item; unsigned int offset; item = uinfo->value.enumerated.item; offset = snd_ctl_get_ioff(kcontrol, &uinfo->id); *uinfo = ue->info; snd_ctl_build_ioff(&uinfo->id, kcontrol, offset); item = min(item, uinfo->value.enumerated.items - 1); uinfo->value.enumerated.item = item; names = ue->priv_data; for (; item > 0; --item) names += strlen(names) + 1; strcpy(uinfo->value.enumerated.name, names); return 0; } static int snd_ctl_elem_user_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct user_element *ue = kcontrol->private_data; unsigned int size = ue->elem_data_size; char *src = ue->elem_data + snd_ctl_get_ioff(kcontrol, &ucontrol->id) * size; memcpy(&ucontrol->value, src, size); return 0; } static int snd_ctl_elem_user_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { int change; struct user_element *ue = kcontrol->private_data; unsigned int size = ue->elem_data_size; char *dst = ue->elem_data + snd_ctl_get_ioff(kcontrol, &ucontrol->id) * size; change = memcmp(&ucontrol->value, dst, size) != 0; if (change) memcpy(dst, &ucontrol->value, size); return change; } /* called in controls_rwsem write lock */ static int replace_user_tlv(struct snd_kcontrol *kctl, unsigned int __user *buf, unsigned int size) { struct user_element *ue = kctl->private_data; unsigned int *container; unsigned int mask = 0; int i; int change; lockdep_assert_held_write(&ue->card->controls_rwsem); if (size > 1024 * 128) /* sane value */ return -EINVAL; // does the TLV size change cause overflow? if (check_user_elem_overflow(ue->card, (ssize_t)(size - ue->tlv_data_size))) return -ENOMEM; container = vmemdup_user(buf, size); if (IS_ERR(container)) return PTR_ERR(container); change = ue->tlv_data_size != size; if (!change) change = memcmp(ue->tlv_data, container, size) != 0; if (!change) { kvfree(container); return 0; } if (ue->tlv_data == NULL) { /* Now TLV data is available. */ for (i = 0; i < kctl->count; ++i) kctl->vd[i].access |= SNDRV_CTL_ELEM_ACCESS_TLV_READ; mask = SNDRV_CTL_EVENT_MASK_INFO; } else { ue->card->user_ctl_alloc_size -= ue->tlv_data_size; ue->tlv_data_size = 0; kvfree(ue->tlv_data); } ue->tlv_data = container; ue->tlv_data_size = size; // decremented at private_free. 
ue->card->user_ctl_alloc_size += size; mask |= SNDRV_CTL_EVENT_MASK_TLV; for (i = 0; i < kctl->count; ++i) snd_ctl_notify_one(ue->card, mask, kctl, i); return change; } static int read_user_tlv(struct snd_kcontrol *kctl, unsigned int __user *buf, unsigned int size) { struct user_element *ue = kctl->private_data; if (ue->tlv_data_size == 0 || ue->tlv_data == NULL) return -ENXIO; if (size < ue->tlv_data_size) return -ENOSPC; if (copy_to_user(buf, ue->tlv_data, ue->tlv_data_size)) return -EFAULT; return 0; } static int snd_ctl_elem_user_tlv(struct snd_kcontrol *kctl, int op_flag, unsigned int size, unsigned int __user *buf) { if (op_flag == SNDRV_CTL_TLV_OP_WRITE) return replace_user_tlv(kctl, buf, size); else return read_user_tlv(kctl, buf, size); } /* called in controls_rwsem write lock */ static int snd_ctl_elem_init_enum_names(struct user_element *ue) { char *names, *p; size_t buf_len, name_len; unsigned int i; const uintptr_t user_ptrval = ue->info.value.enumerated.names_ptr; lockdep_assert_held_write(&ue->card->controls_rwsem); buf_len = ue->info.value.enumerated.names_length; if (buf_len > 64 * 1024) return -EINVAL; if (check_user_elem_overflow(ue->card, buf_len)) return -ENOMEM; names = vmemdup_user((const void __user *)user_ptrval, buf_len); if (IS_ERR(names)) return PTR_ERR(names); /* check that there are enough valid names */ p = names; for (i = 0; i < ue->info.value.enumerated.items; ++i) { name_len = strnlen(p, buf_len); if (name_len == 0 || name_len >= 64 || name_len == buf_len) { kvfree(names); return -EINVAL; } p += name_len + 1; buf_len -= name_len + 1; } ue->priv_data = names; ue->info.value.enumerated.names_ptr = 0; // increment the allocation size; decremented again at private_free. ue->card->user_ctl_alloc_size += ue->info.value.enumerated.names_length; return 0; } static size_t compute_user_elem_size(size_t size, unsigned int count) { return sizeof(struct user_element) + size * count; } static void snd_ctl_elem_user_free(struct snd_kcontrol *kcontrol) { struct user_element *ue = kcontrol->private_data; // decrement the allocation size. ue->card->user_ctl_alloc_size -= compute_user_elem_size(ue->elem_data_size, kcontrol->count); ue->card->user_ctl_alloc_size -= ue->tlv_data_size; if (ue->priv_data) ue->card->user_ctl_alloc_size -= ue->info.value.enumerated.names_length; kvfree(ue->tlv_data); kvfree(ue->priv_data); kfree(ue); } static int snd_ctl_elem_add(struct snd_ctl_file *file, struct snd_ctl_elem_info *info, int replace) { struct snd_card *card = file->card; struct snd_kcontrol *kctl; unsigned int count; unsigned int access; long private_size; size_t alloc_size; struct user_element *ue; unsigned int offset; int err; if (!*info->id.name) return -EINVAL; if (strnlen(info->id.name, sizeof(info->id.name)) >= sizeof(info->id.name)) return -EINVAL; /* Delete a control to replace them if needed. */ if (replace) { info->id.numid = 0; err = snd_ctl_remove_user_ctl(file, &info->id); if (err) return err; } /* Check the number of elements for this userspace control. */ count = info->owner; if (count == 0) count = 1; /* Arrange access permissions if needed. */ access = info->access; if (access == 0) access = SNDRV_CTL_ELEM_ACCESS_READWRITE; access &= (SNDRV_CTL_ELEM_ACCESS_READWRITE | SNDRV_CTL_ELEM_ACCESS_INACTIVE | SNDRV_CTL_ELEM_ACCESS_TLV_WRITE); /* In initial state, nothing is available as TLV container. 
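 * The TLV_READ access bit is granted later by replace_user_tlv(), once
 * the first TLV payload has been written.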
	 */
	if (access & SNDRV_CTL_ELEM_ACCESS_TLV_WRITE)
		access |= SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK;
	access |= SNDRV_CTL_ELEM_ACCESS_USER;

	/*
	 * Check information and calculate the size of data specific to
	 * this userspace control.
	 */
	/* pass NULL to card for suppressing error messages */
	err = snd_ctl_check_elem_info(NULL, info);
	if (err < 0)
		return err;
	/* user-space control doesn't allow zero-size data */
	if (info->count < 1)
		return -EINVAL;
	private_size = value_sizes[info->type] * info->count;
	alloc_size = compute_user_elem_size(private_size, count);

	down_write(&card->controls_rwsem);
	if (check_user_elem_overflow(card, alloc_size)) {
		err = -ENOMEM;
		goto unlock;
	}

	/*
	 * Keep memory object for this userspace control. After passing this
	 * code block, the instance should be freed by snd_ctl_free_one().
	 *
	 * Note that these elements in this control are locked.
	 */
	err = snd_ctl_new(&kctl, count, access, file);
	if (err < 0)
		goto unlock;
	memcpy(&kctl->id, &info->id, sizeof(kctl->id));
	ue = kzalloc(alloc_size, GFP_KERNEL);
	if (!ue) {
		kfree(kctl);
		err = -ENOMEM;
		goto unlock;
	}
	kctl->private_data = ue;
	kctl->private_free = snd_ctl_elem_user_free;

	// increment the allocated size; decremented again at private_free.
	card->user_ctl_alloc_size += alloc_size;

	/* Set private data for this userspace control. */
	ue->card = card;
	ue->info = *info;
	ue->info.access = 0;
	ue->elem_data = (char *)ue + sizeof(*ue);
	ue->elem_data_size = private_size;
	if (ue->info.type == SNDRV_CTL_ELEM_TYPE_ENUMERATED) {
		err = snd_ctl_elem_init_enum_names(ue);
		if (err < 0) {
			snd_ctl_free_one(kctl);
			goto unlock;
		}
	}

	/* Set callback functions. */
	if (info->type == SNDRV_CTL_ELEM_TYPE_ENUMERATED)
		kctl->info = snd_ctl_elem_user_enum_info;
	else
		kctl->info = snd_ctl_elem_user_info;
	if (access & SNDRV_CTL_ELEM_ACCESS_READ)
		kctl->get = snd_ctl_elem_user_get;
	if (access & SNDRV_CTL_ELEM_ACCESS_WRITE)
		kctl->put = snd_ctl_elem_user_put;
	if (access & SNDRV_CTL_ELEM_ACCESS_TLV_WRITE)
		kctl->tlv.c = snd_ctl_elem_user_tlv;

	/* the instance must be freed by snd_ctl_free_one() on failure here */
	err = __snd_ctl_add_replace(card, kctl, CTL_ADD_EXCLUSIVE);
	if (err < 0) {
		snd_ctl_free_one(kctl);
		goto unlock;
	}
	offset = snd_ctl_get_ioff(kctl, &info->id);
	snd_ctl_build_ioff(&info->id, kctl, offset);
	/*
	 * Here we cannot fill any field for the number of elements added by
	 * this operation because there is no dedicated field. Using the
	 * 'owner' field for this purpose could break user-space applications,
	 * because that field originally means the PID of the process that
	 * locks the element.
*/ unlock: up_write(&card->controls_rwsem); return err; } static int snd_ctl_elem_add_user(struct snd_ctl_file *file, struct snd_ctl_elem_info __user *_info, int replace) { struct snd_ctl_elem_info info; int err; if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; err = snd_ctl_elem_add(file, &info, replace); if (err < 0) return err; if (copy_to_user(_info, &info, sizeof(info))) { snd_ctl_remove_user_ctl(file, &info.id); return -EFAULT; } return 0; } static int snd_ctl_elem_remove(struct snd_ctl_file *file, struct snd_ctl_elem_id __user *_id) { struct snd_ctl_elem_id id; if (copy_from_user(&id, _id, sizeof(id))) return -EFAULT; return snd_ctl_remove_user_ctl(file, &id); } static int snd_ctl_subscribe_events(struct snd_ctl_file *file, int __user *ptr) { int subscribe; if (get_user(subscribe, ptr)) return -EFAULT; if (subscribe < 0) { subscribe = file->subscribed; if (put_user(subscribe, ptr)) return -EFAULT; return 0; } if (subscribe) { file->subscribed = 1; return 0; } else if (file->subscribed) { snd_ctl_empty_read_queue(file); file->subscribed = 0; } return 0; } static int call_tlv_handler(struct snd_ctl_file *file, int op_flag, struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id, unsigned int __user *buf, unsigned int size) { static const struct { int op; int perm; } pairs[] = { {SNDRV_CTL_TLV_OP_READ, SNDRV_CTL_ELEM_ACCESS_TLV_READ}, {SNDRV_CTL_TLV_OP_WRITE, SNDRV_CTL_ELEM_ACCESS_TLV_WRITE}, {SNDRV_CTL_TLV_OP_CMD, SNDRV_CTL_ELEM_ACCESS_TLV_COMMAND}, }; struct snd_kcontrol_volatile *vd = &kctl->vd[snd_ctl_get_ioff(kctl, id)]; int i, ret; /* Check support of the request for this element. */ for (i = 0; i < ARRAY_SIZE(pairs); ++i) { if (op_flag == pairs[i].op && (vd->access & pairs[i].perm)) break; } if (i == ARRAY_SIZE(pairs)) return -ENXIO; if (kctl->tlv.c == NULL) return -ENXIO; /* Write and command operations are not allowed for locked element. */ if (op_flag != SNDRV_CTL_TLV_OP_READ && vd->owner != NULL && vd->owner != file) return -EPERM; ret = snd_power_ref_and_wait(file->card); if (!ret) ret = kctl->tlv.c(kctl, op_flag, size, buf); snd_power_unref(file->card); return ret; } static int read_tlv_buf(struct snd_kcontrol *kctl, struct snd_ctl_elem_id *id, unsigned int __user *buf, unsigned int size) { struct snd_kcontrol_volatile *vd = &kctl->vd[snd_ctl_get_ioff(kctl, id)]; unsigned int len; if (!(vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_READ)) return -ENXIO; if (kctl->tlv.p == NULL) return -ENXIO; len = sizeof(unsigned int) * 2 + kctl->tlv.p[1]; if (size < len) return -ENOMEM; if (copy_to_user(buf, kctl->tlv.p, len)) return -EFAULT; return 0; } static int snd_ctl_tlv_ioctl(struct snd_ctl_file *file, struct snd_ctl_tlv __user *buf, int op_flag) { struct snd_ctl_tlv header; unsigned int __user *container; unsigned int container_size; struct snd_kcontrol *kctl; struct snd_ctl_elem_id id; struct snd_kcontrol_volatile *vd; lockdep_assert_held(&file->card->controls_rwsem); if (copy_from_user(&header, buf, sizeof(header))) return -EFAULT; /* In design of control core, numerical ID starts at 1. */ if (header.numid == 0) return -EINVAL; /* At least, container should include type and length fields. */ if (header.length < sizeof(unsigned int) * 2) return -EINVAL; container_size = header.length; container = buf->tlv; kctl = snd_ctl_find_numid_locked(file->card, header.numid); if (kctl == NULL) return -ENOENT; /* Calculate index of the element in this set. 
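	 * Numerical IDs within one element set are consecutive, so the
	 * offset of the element is header.numid - kctl->id.numid.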
*/ id = kctl->id; snd_ctl_build_ioff(&id, kctl, header.numid - id.numid); vd = &kctl->vd[snd_ctl_get_ioff(kctl, &id)]; if (vd->access & SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK) { return call_tlv_handler(file, op_flag, kctl, &id, container, container_size); } else { if (op_flag == SNDRV_CTL_TLV_OP_READ) { return read_tlv_buf(kctl, &id, container, container_size); } } /* Not supported. */ return -ENXIO; } static long snd_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_ctl_file *ctl; struct snd_card *card; struct snd_kctl_ioctl *p; void __user *argp = (void __user *)arg; int __user *ip = argp; int err; ctl = file->private_data; card = ctl->card; if (snd_BUG_ON(!card)) return -ENXIO; switch (cmd) { case SNDRV_CTL_IOCTL_PVERSION: return put_user(SNDRV_CTL_VERSION, ip) ? -EFAULT : 0; case SNDRV_CTL_IOCTL_CARD_INFO: return snd_ctl_card_info(card, ctl, cmd, argp); case SNDRV_CTL_IOCTL_ELEM_LIST: return snd_ctl_elem_list_user(card, argp); case SNDRV_CTL_IOCTL_ELEM_INFO: return snd_ctl_elem_info_user(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_READ: return snd_ctl_elem_read_user(card, argp); case SNDRV_CTL_IOCTL_ELEM_WRITE: return snd_ctl_elem_write_user(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_LOCK: return snd_ctl_elem_lock(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_UNLOCK: return snd_ctl_elem_unlock(ctl, argp); case SNDRV_CTL_IOCTL_ELEM_ADD: return snd_ctl_elem_add_user(ctl, argp, 0); case SNDRV_CTL_IOCTL_ELEM_REPLACE: return snd_ctl_elem_add_user(ctl, argp, 1); case SNDRV_CTL_IOCTL_ELEM_REMOVE: return snd_ctl_elem_remove(ctl, argp); case SNDRV_CTL_IOCTL_SUBSCRIBE_EVENTS: return snd_ctl_subscribe_events(ctl, ip); case SNDRV_CTL_IOCTL_TLV_READ: down_read(&ctl->card->controls_rwsem); err = snd_ctl_tlv_ioctl(ctl, argp, SNDRV_CTL_TLV_OP_READ); up_read(&ctl->card->controls_rwsem); return err; case SNDRV_CTL_IOCTL_TLV_WRITE: down_write(&ctl->card->controls_rwsem); err = snd_ctl_tlv_ioctl(ctl, argp, SNDRV_CTL_TLV_OP_WRITE); up_write(&ctl->card->controls_rwsem); return err; case SNDRV_CTL_IOCTL_TLV_COMMAND: down_write(&ctl->card->controls_rwsem); err = snd_ctl_tlv_ioctl(ctl, argp, SNDRV_CTL_TLV_OP_CMD); up_write(&ctl->card->controls_rwsem); return err; case SNDRV_CTL_IOCTL_POWER: return -ENOPROTOOPT; case SNDRV_CTL_IOCTL_POWER_STATE: return put_user(SNDRV_CTL_POWER_D0, ip) ? 
-EFAULT : 0; } down_read(&snd_ioctl_rwsem); list_for_each_entry(p, &snd_control_ioctls, list) { err = p->fioctl(card, ctl, cmd, arg); if (err != -ENOIOCTLCMD) { up_read(&snd_ioctl_rwsem); return err; } } up_read(&snd_ioctl_rwsem); dev_dbg(card->dev, "unknown ioctl = 0x%x\n", cmd); return -ENOTTY; } static ssize_t snd_ctl_read(struct file *file, char __user *buffer, size_t count, loff_t * offset) { struct snd_ctl_file *ctl; int err = 0; ssize_t result = 0; ctl = file->private_data; if (snd_BUG_ON(!ctl || !ctl->card)) return -ENXIO; if (!ctl->subscribed) return -EBADFD; if (count < sizeof(struct snd_ctl_event)) return -EINVAL; spin_lock_irq(&ctl->read_lock); while (count >= sizeof(struct snd_ctl_event)) { struct snd_ctl_event ev; struct snd_kctl_event *kev; while (list_empty(&ctl->events)) { wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; goto __end_lock; } init_waitqueue_entry(&wait, current); add_wait_queue(&ctl->change_sleep, &wait); set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&ctl->read_lock); schedule(); remove_wait_queue(&ctl->change_sleep, &wait); if (ctl->card->shutdown) return -ENODEV; if (signal_pending(current)) return -ERESTARTSYS; spin_lock_irq(&ctl->read_lock); } kev = snd_kctl_event(ctl->events.next); ev.type = SNDRV_CTL_EVENT_ELEM; ev.data.elem.mask = kev->mask; ev.data.elem.id = kev->id; list_del(&kev->list); spin_unlock_irq(&ctl->read_lock); kfree(kev); if (copy_to_user(buffer, &ev, sizeof(struct snd_ctl_event))) { err = -EFAULT; goto __end; } spin_lock_irq(&ctl->read_lock); buffer += sizeof(struct snd_ctl_event); count -= sizeof(struct snd_ctl_event); result += sizeof(struct snd_ctl_event); } __end_lock: spin_unlock_irq(&ctl->read_lock); __end: return result > 0 ? result : err; } static __poll_t snd_ctl_poll(struct file *file, poll_table * wait) { __poll_t mask; struct snd_ctl_file *ctl; ctl = file->private_data; if (!ctl->subscribed) return 0; poll_wait(file, &ctl->change_sleep, wait); mask = 0; if (!list_empty(&ctl->events)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } /* * register the device-specific control-ioctls. * called from each device manager like pcm.c, hwdep.c, etc. */ static int _snd_ctl_register_ioctl(snd_kctl_ioctl_func_t fcn, struct list_head *lists) { struct snd_kctl_ioctl *pn; pn = kzalloc(sizeof(struct snd_kctl_ioctl), GFP_KERNEL); if (pn == NULL) return -ENOMEM; pn->fioctl = fcn; down_write(&snd_ioctl_rwsem); list_add_tail(&pn->list, lists); up_write(&snd_ioctl_rwsem); return 0; } /** * snd_ctl_register_ioctl - register the device-specific control-ioctls * @fcn: ioctl callback function * * called from each device manager like pcm.c, hwdep.c, etc. * * Return: zero if successful, or a negative error code */ int snd_ctl_register_ioctl(snd_kctl_ioctl_func_t fcn) { return _snd_ctl_register_ioctl(fcn, &snd_control_ioctls); } EXPORT_SYMBOL(snd_ctl_register_ioctl); #ifdef CONFIG_COMPAT /** * snd_ctl_register_ioctl_compat - register the device-specific 32bit compat * control-ioctls * @fcn: ioctl callback function * * Return: zero if successful, or a negative error code */ int snd_ctl_register_ioctl_compat(snd_kctl_ioctl_func_t fcn) { return _snd_ctl_register_ioctl(fcn, &snd_control_compat_ioctls); } EXPORT_SYMBOL(snd_ctl_register_ioctl_compat); #endif /* * de-register the device-specific control-ioctls. 
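 * Called from each device manager like pcm.c, hwdep.c, etc.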
 */
static int _snd_ctl_unregister_ioctl(snd_kctl_ioctl_func_t fcn,
				     struct list_head *lists)
{
	struct snd_kctl_ioctl *p;

	if (snd_BUG_ON(!fcn))
		return -EINVAL;
	down_write(&snd_ioctl_rwsem);
	list_for_each_entry(p, lists, list) {
		if (p->fioctl == fcn) {
			list_del(&p->list);
			up_write(&snd_ioctl_rwsem);
			kfree(p);
			return 0;
		}
	}
	up_write(&snd_ioctl_rwsem);
	snd_BUG();
	return -EINVAL;
}

/**
 * snd_ctl_unregister_ioctl - de-register the device-specific control-ioctls
 * @fcn: ioctl callback function to unregister
 *
 * Return: zero if successful, or a negative error code
 */
int snd_ctl_unregister_ioctl(snd_kctl_ioctl_func_t fcn)
{
	return _snd_ctl_unregister_ioctl(fcn, &snd_control_ioctls);
}
EXPORT_SYMBOL(snd_ctl_unregister_ioctl);

#ifdef CONFIG_COMPAT
/**
 * snd_ctl_unregister_ioctl_compat - de-register the device-specific compat
 * 32bit control-ioctls
 * @fcn: ioctl callback function to unregister
 *
 * Return: zero if successful, or a negative error code
 */
int snd_ctl_unregister_ioctl_compat(snd_kctl_ioctl_func_t fcn)
{
	return _snd_ctl_unregister_ioctl(fcn, &snd_control_compat_ioctls);
}
EXPORT_SYMBOL(snd_ctl_unregister_ioctl_compat);
#endif

static int snd_ctl_fasync(int fd, struct file *file, int on)
{
	struct snd_ctl_file *ctl;

	ctl = file->private_data;
	return snd_fasync_helper(fd, file, on, &ctl->fasync);
}

/* return the preferred subdevice number if already assigned;
 * otherwise return -1
 */
int snd_ctl_get_preferred_subdevice(struct snd_card *card, int type)
{
	struct snd_ctl_file *kctl;
	int subdevice = -1;
	unsigned long flags;

	read_lock_irqsave(&card->ctl_files_rwlock, flags);
	list_for_each_entry(kctl, &card->ctl_files, list) {
		if (kctl->pid == task_pid(current)) {
			subdevice = kctl->preferred_subdevice[type];
			if (subdevice != -1)
				break;
		}
	}
	read_unlock_irqrestore(&card->ctl_files_rwlock, flags);
	return subdevice;
}
EXPORT_SYMBOL_GPL(snd_ctl_get_preferred_subdevice);

/*
 * ioctl32 compat
 */
#ifdef CONFIG_COMPAT
#include "control_compat.c"
#else
#define snd_ctl_ioctl_compat	NULL
#endif

/*
 * control layers (audio LED etc.)
 */

/**
 * snd_ctl_request_layer - request to use the layer
 * @module_name: Name of the kernel module (NULL == built-in)
 *
 * Return: zero if successful, or an error code when the module cannot be loaded
 */
int snd_ctl_request_layer(const char *module_name)
{
	struct snd_ctl_layer_ops *lops;

	if (module_name == NULL)
		return 0;
	down_read(&snd_ctl_layer_rwsem);
	for (lops = snd_ctl_layer; lops; lops = lops->next)
		if (strcmp(lops->module_name, module_name) == 0)
			break;
	up_read(&snd_ctl_layer_rwsem);
	if (lops)
		return 0;
	return request_module(module_name);
}
EXPORT_SYMBOL_GPL(snd_ctl_request_layer);

/**
 * snd_ctl_register_layer - register a new control layer
 * @lops: operation structure
 *
 * The new layer can track all control elements and do additional
 * operations on top (like audio LED handling).
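 * Registration also replays lregister() for each card already present.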
*/ void snd_ctl_register_layer(struct snd_ctl_layer_ops *lops) { struct snd_card *card; int card_number; down_write(&snd_ctl_layer_rwsem); lops->next = snd_ctl_layer; snd_ctl_layer = lops; up_write(&snd_ctl_layer_rwsem); for (card_number = 0; card_number < SNDRV_CARDS; card_number++) { card = snd_card_ref(card_number); if (card) { down_read(&card->controls_rwsem); lops->lregister(card); up_read(&card->controls_rwsem); snd_card_unref(card); } } } EXPORT_SYMBOL_GPL(snd_ctl_register_layer); /** * snd_ctl_disconnect_layer - disconnect control layer * @lops: operation structure * * It is expected that the information about tracked cards * is freed before this call (the disconnect callback is * not called here). */ void snd_ctl_disconnect_layer(struct snd_ctl_layer_ops *lops) { struct snd_ctl_layer_ops *lops2, *prev_lops2; down_write(&snd_ctl_layer_rwsem); for (lops2 = snd_ctl_layer, prev_lops2 = NULL; lops2; lops2 = lops2->next) { if (lops2 == lops) { if (!prev_lops2) snd_ctl_layer = lops->next; else prev_lops2->next = lops->next; break; } prev_lops2 = lops2; } up_write(&snd_ctl_layer_rwsem); } EXPORT_SYMBOL_GPL(snd_ctl_disconnect_layer); /* * INIT PART */ static const struct file_operations snd_ctl_f_ops = { .owner = THIS_MODULE, .read = snd_ctl_read, .open = snd_ctl_open, .release = snd_ctl_release, .llseek = no_llseek, .poll = snd_ctl_poll, .unlocked_ioctl = snd_ctl_ioctl, .compat_ioctl = snd_ctl_ioctl_compat, .fasync = snd_ctl_fasync, }; /* * registration of the control device */ static int snd_ctl_dev_register(struct snd_device *device) { struct snd_card *card = device->device_data; struct snd_ctl_layer_ops *lops; int err; err = snd_register_device(SNDRV_DEVICE_TYPE_CONTROL, card, -1, &snd_ctl_f_ops, card, card->ctl_dev); if (err < 0) return err; down_read(&card->controls_rwsem); down_read(&snd_ctl_layer_rwsem); for (lops = snd_ctl_layer; lops; lops = lops->next) lops->lregister(card); up_read(&snd_ctl_layer_rwsem); up_read(&card->controls_rwsem); return 0; } /* * disconnection of the control device */ static int snd_ctl_dev_disconnect(struct snd_device *device) { struct snd_card *card = device->device_data; struct snd_ctl_file *ctl; struct snd_ctl_layer_ops *lops; unsigned long flags; read_lock_irqsave(&card->ctl_files_rwlock, flags); list_for_each_entry(ctl, &card->ctl_files, list) { wake_up(&ctl->change_sleep); snd_kill_fasync(ctl->fasync, SIGIO, POLL_ERR); } read_unlock_irqrestore(&card->ctl_files_rwlock, flags); down_read(&card->controls_rwsem); down_read(&snd_ctl_layer_rwsem); for (lops = snd_ctl_layer; lops; lops = lops->next) lops->ldisconnect(card); up_read(&snd_ctl_layer_rwsem); up_read(&card->controls_rwsem); return snd_unregister_device(card->ctl_dev); } /* * free all controls */ static int snd_ctl_dev_free(struct snd_device *device) { struct snd_card *card = device->device_data; struct snd_kcontrol *control; down_write(&card->controls_rwsem); while (!list_empty(&card->controls)) { control = snd_kcontrol(card->controls.next); __snd_ctl_remove(card, control, false); } #ifdef CONFIG_SND_CTL_FAST_LOOKUP xa_destroy(&card->ctl_numids); xa_destroy(&card->ctl_hash); #endif up_write(&card->controls_rwsem); put_device(card->ctl_dev); return 0; } /* * create control core: * called from init.c */ int snd_ctl_create(struct snd_card *card) { static const struct snd_device_ops ops = { .dev_free = snd_ctl_dev_free, .dev_register = snd_ctl_dev_register, .dev_disconnect = snd_ctl_dev_disconnect, }; int err; if (snd_BUG_ON(!card)) return -ENXIO; if (snd_BUG_ON(card->number < 0 || card->number >= 
SNDRV_CARDS)) return -ENXIO; err = snd_device_alloc(&card->ctl_dev, card); if (err < 0) return err; dev_set_name(card->ctl_dev, "controlC%d", card->number); err = snd_device_new(card, SNDRV_DEV_CONTROL, card, &ops); if (err < 0) put_device(card->ctl_dev); return err; } /* * Frequently used control callbacks/helpers */ /** * snd_ctl_boolean_mono_info - Helper function for a standard boolean info * callback with a mono channel * @kcontrol: the kcontrol instance * @uinfo: info to store * * This is a function that can be used as info callback for a standard * boolean control with a single mono channel. * * Return: Zero (always successful) */ int snd_ctl_boolean_mono_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 1; return 0; } EXPORT_SYMBOL(snd_ctl_boolean_mono_info); /** * snd_ctl_boolean_stereo_info - Helper function for a standard boolean info * callback with stereo two channels * @kcontrol: the kcontrol instance * @uinfo: info to store * * This is a function that can be used as info callback for a standard * boolean control with stereo two channels. * * Return: Zero (always successful) */ int snd_ctl_boolean_stereo_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; uinfo->count = 2; uinfo->value.integer.min = 0; uinfo->value.integer.max = 1; return 0; } EXPORT_SYMBOL(snd_ctl_boolean_stereo_info); /** * snd_ctl_enum_info - fills the info structure for an enumerated control * @info: the structure to be filled * @channels: the number of the control's channels; often one * @items: the number of control values; also the size of @names * @names: an array containing the names of all control values * * Sets all required fields in @info to their appropriate values. * If the control's accessibility is not the default (readable and writable), * the caller has to fill @info->access. * * Return: Zero (always successful) */ int snd_ctl_enum_info(struct snd_ctl_elem_info *info, unsigned int channels, unsigned int items, const char *const names[]) { info->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; info->count = channels; info->value.enumerated.items = items; if (!items) return 0; if (info->value.enumerated.item >= items) info->value.enumerated.item = items - 1; WARN(strlen(names[info->value.enumerated.item]) >= sizeof(info->value.enumerated.name), "ALSA: too long item name '%s'\n", names[info->value.enumerated.item]); strscpy(info->value.enumerated.name, names[info->value.enumerated.item], sizeof(info->value.enumerated.name)); return 0; } EXPORT_SYMBOL(snd_ctl_enum_info); |
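/*
 * Illustrative usage sketch, not part of the file above: a driver-side
 * .info callback built on snd_ctl_enum_info(), plus a control that reuses
 * snd_ctl_boolean_mono_info() directly. The control names and all my_*
 * identifiers are hypothetical.
 */
#include <sound/control.h>

static const char * const my_capture_src[] = { "Mic", "Line", "CD" };

static int my_capture_src_info(struct snd_kcontrol *kcontrol,
			       struct snd_ctl_elem_info *uinfo)
{
	/* one channel, three enumerated items */
	return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(my_capture_src),
				 my_capture_src);
}

static const struct snd_kcontrol_new my_controls[] = {
	{
		.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
		.name = "Capture Source",
		.info = my_capture_src_info,
		/* .get and .put omitted from this sketch */
	},
	{
		.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
		.name = "Loopback Switch",
		.info = snd_ctl_boolean_mono_info,
	},
};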
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * CRC32C
 * @Article{castagnoli-crc,
 *   author  = {Guy Castagnoli and Stefan Braeuer and Martin Herrman},
 *   title   = {{Optimization of Cyclic Redundancy-Check Codes with 24
 *               and 32 Parity Bits}},
 *   journal = IEEE Transactions on Communication,
 *   year    = {1993},
 *   volume  = {41},
 *   number  = {6},
 *   pages   = {},
 *   month   = {June},
 * }
 * Used by the iSCSI driver, possibly others, and derived from
 * the iscsi-crc.c module of the linux-iscsi driver at
 * http://linux-iscsi.sourceforge.net.
 *
 * Following the example of lib/crc32, this function is intended to be
 * flexible and useful for all users. Modules that currently have their
 * own crc32c, but hopefully may be able to use this one are:
 *   net/sctp (please add all your doco to here if you change to
 *             use this one!)
 *   <endoflist>
 *
 * Copyright (c) 2004 Cisco Systems, Inc.
 */

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/crc32c.h>

static struct crypto_shash *tfm;

u32 crc32c(u32 crc, const void *address, unsigned int length)
{
	SHASH_DESC_ON_STACK(shash, tfm);
	u32 ret, *ctx = (u32 *)shash_desc_ctx(shash);
	int err;

	shash->tfm = tfm;
	*ctx = crc;

	err = crypto_shash_update(shash, address, length);
	BUG_ON(err);

	ret = *ctx;
	barrier_data(ctx);
	return ret;
}
EXPORT_SYMBOL(crc32c);

static int __init libcrc32c_mod_init(void)
{
	tfm = crypto_alloc_shash("crc32c", 0, 0);
	return PTR_ERR_OR_ZERO(tfm);
}

static void __exit libcrc32c_mod_fini(void)
{
	crypto_free_shash(tfm);
}

module_init(libcrc32c_mod_init);
module_exit(libcrc32c_mod_fini);

MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>");
MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations");
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
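/*
 * Illustrative usage sketch, not part of the file above: seeding and
 * finalizing a CRC-32C over a buffer. Seeding with ~0 and inverting the
 * result is what users such as SCTP do per their specifications; the
 * helper name and the chunking comment describe a hypothetical caller.
 */
#include <linux/crc32c.h>

static u32 my_crc32c_final(const void *buf, unsigned int len)
{
	/* crc32c() can also be called repeatedly to checksum data in chunks */
	return ~crc32c(~0U, buf, len);
}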
#undef TRACE_SYSTEM
#define TRACE_SYSTEM neigh

#if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NEIGH_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>
#include <net/neighbour.h>

#define neigh_state_str(state)				\
	__print_symbolic(state,				\
		{ NUD_INCOMPLETE, "incomplete" },	\
		{ NUD_REACHABLE, "reachable" },		\
		{ NUD_STALE, "stale" },			\
		{ NUD_DELAY, "delay" },			\
		{ NUD_PROBE, "probe" },			\
		{ NUD_FAILED, "failed" },		\
		{ NUD_NOARP, "noarp" },			\
		{ NUD_PERMANENT, "permanent"})

TRACE_EVENT(neigh_create,
	TP_PROTO(struct neigh_table *tbl, struct net_device *dev,
		 const void *pkey, const struct neighbour *n,
		 bool exempt_from_gc),

	TP_ARGS(tbl, dev, pkey, n, exempt_from_gc),

	TP_STRUCT__entry(
		__field(u32, family)
		__string(dev, dev ? dev->name : "NULL")
		__field(int, entries)
		__field(u8, created)
		__field(u8, gc_exempt)
		__array(u8, primary_key4, 4)
		__array(u8, primary_key6, 16)
	),

	TP_fast_assign(
		__be32 *p32;

		__entry->family = tbl->family;
		__assign_str(dev, (dev ? dev->name : "NULL"));
		__entry->entries = atomic_read(&tbl->gc_entries);
		__entry->created = n != NULL;
		__entry->gc_exempt = exempt_from_gc;
		p32 = (__be32 *)__entry->primary_key4;

		if (tbl->family == AF_INET)
			*p32 = *(__be32 *)pkey;
		else
			*p32 = 0;

#if IS_ENABLED(CONFIG_IPV6)
		if (tbl->family == AF_INET6) {
			struct in6_addr *pin6;

			pin6 = (struct in6_addr *)__entry->primary_key6;
			*pin6 = *(struct in6_addr *)pkey;
		}
#endif
	),

	TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d",
		  __entry->family, __get_str(dev), __entry->entries,
		  __entry->primary_key4, __entry->primary_key6,
		  __entry->created, __entry->gc_exempt)
);

TRACE_EVENT(neigh_update,
	TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new,
		 u32 flags, u32 nlmsg_pid),

	TP_ARGS(n, lladdr, new, flags, nlmsg_pid),

	TP_STRUCT__entry(
		__field(u32, family)
		__string(dev, (n->dev ? n->dev->name : "NULL"))
		__array(u8, lladdr, MAX_ADDR_LEN)
		__field(u8, lladdr_len)
		__field(u8, flags)
		__field(u8, nud_state)
		__field(u8, type)
		__field(u8, dead)
		__field(int, refcnt)
		__array(__u8, primary_key4, 4)
		__array(__u8, primary_key6, 16)
		__field(unsigned long, confirmed)
		__field(unsigned long, updated)
		__field(unsigned long, used)
		__array(u8, new_lladdr, MAX_ADDR_LEN)
		__field(u8, new_state)
		__field(u32, update_flags)
		__field(u32, pid)
	),

	TP_fast_assign(
		int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN);
		struct in6_addr *pin6;
		__be32 *p32;

		__entry->family = n->tbl->family;
		__assign_str(dev, (n->dev ?
n->dev->name : "NULL")); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; if (lladdr) memcpy(__entry->new_lladdr, lladdr, lladdr_len); __entry->new_state = new; __entry->update_flags = flags; __entry->pid = nlmsg_pid; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu new_lladdr %s " "new_state %s update_flags %02x pid %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __print_hex_str(__entry->new_lladdr, __entry->lladdr_len), neigh_state_str(__entry->new_state), __entry->update_flags, __entry->pid) ); DECLARE_EVENT_CLASS(neigh__update, TP_PROTO(struct neighbour *n, int err), TP_ARGS(n, err), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __field(u32, err) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev, (n->dev ? 
n->dev->name : "NULL")); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; __entry->err = err; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu err %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __entry->err) ); DEFINE_EVENT(neigh__update, neigh_update_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_timer_handler, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_dead, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_cleanup_and_release, TP_PROTO(struct neighbour *neigh, int rc), TP_ARGS(neigh, rc) ); #endif /* _TRACE_NEIGH_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
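/*
 * Illustrative usage sketch, not part of the header above: firing the
 * neigh_update tracepoint generated by TRACE_EVENT(neigh_update, ...).
 * Exactly one .c file must define CREATE_TRACE_POINTS before including
 * the header so the events are instantiated; the wrapper function here
 * is hypothetical.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/neigh.h>

static void my_report_update(struct neighbour *n, const u8 *lladdr,
			     u8 new_state, u32 flags, u32 nlmsg_pid)
{
	/* a no-op unless the "neigh:neigh_update" event is enabled */
	trace_neigh_update(n, lladdr, new_state, flags, nlmsg_pid);
}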
/*
 * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */

#include "rds.h"

/*
 * XXX
 *  - build with sparse
 *  - should we detect duplicate keys on a socket?  hmm.
 *  - an rdma is an mlock, apply rlimit?
 */

/*
 * get the number of pages by looking at the page indices that the start and
 * end addresses fall in.
 *
 * Returns 0 if the vec is invalid. It is invalid if the number of bytes
 * causes the address to wrap or overflows an unsigned int. This comes
 * from being stored in the 'length' member of 'struct scatterlist'.
 */
static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
{
	if ((vec->addr + vec->bytes <= vec->addr) ||
	    (vec->bytes > (u64)UINT_MAX))
		return 0;

	return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
		(vec->addr >> PAGE_SHIFT);
}

static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
				       struct rds_mr *insert)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct rds_mr *mr;

	while (*p) {
		parent = *p;
		mr = rb_entry(parent, struct rds_mr, r_rb_node);

		if (key < mr->r_key)
			p = &(*p)->rb_left;
		else if (key > mr->r_key)
			p = &(*p)->rb_right;
		else
			return mr;
	}

	if (insert) {
		rb_link_node(&insert->r_rb_node, parent, p);
		rb_insert_color(&insert->r_rb_node, root);
		kref_get(&insert->r_kref);
	}

	return NULL;
}

/*
 * Destroy the transport-specific part of a MR.
*/ static void rds_destroy_mr(struct rds_mr *mr) { struct rds_sock *rs = mr->r_sock; void *trans_private = NULL; unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); trans_private = mr->r_trans_private; mr->r_trans_private = NULL; spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (trans_private) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } void __rds_put_mr_final(struct kref *kref) { struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); rds_destroy_mr(mr); kfree(mr); } /* * By the time this is called we can't have any more ioctls called on * the socket so we don't need to worry about racing with others. */ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; unsigned long flags; /* Release any MRs associated with this socket */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = rb_entry(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } /* * Helper function to pin user pages. */ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { unsigned int gup_flags = FOLL_LONGTERM; int ret; if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { unpin_user_pages(pages, ret); ret = -EFAULT; } return ret; } static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, u64 *cookie_ret, struct rds_mr **mr_ret, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; unsigned int nents = 0; int need_odp = 0; long i; int ret; if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } /* If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || PAGE_ALIGN(args->vec.addr + args->vec.bytes) < (args->vec.addr + args->vec.bytes)) { ret = -EINVAL; goto out; } if (!can_do_mlock()) { ret = -EPERM; goto out; } nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; goto out; } /* Restrict the size of mr irrespective of underlying transport * To account for unaligned mr regions, subtract one from nr_pages */ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { ret = -EMSGSIZE; goto out; } rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? 
*/ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; goto out; } kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; if (args->flags & RDS_RDMA_USE_ONCE) mr->r_use_once = 1; if (args->flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; if (args->flags & RDS_RDMA_READWRITE) mr->r_write = 1; /* * Pin the pages that make up the user buffer and transfer the page * pointers to the mr's sg array. We check to see if we've mapped * the whole region after transferring the partial page references * to the sg array so that we can have one page ref cleanup path. * * For now we have no flag that tells us whether the mapping is * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret == -EOPNOTSUPP) { need_odp = 1; } else if (ret <= 0) { goto out; } else { nents = ret; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; } WARN_ON(!nents); sg_init_table(sg, nents); /* Stick all pages into the scatterlist */ for (i = 0 ; i < nents; i++) sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); rdsdebug("RDS: trans_private nents is %u\n", nents); } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { /* In ODP case, we don't GUP pages, so don't need * to release anything. */ if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); goto out; } mr->r_trans_private = trans_private; rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", mr->r_key, (void *)(unsigned long) args->cookie_addr); /* The user may pass us an unaligned address, but we can only * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ if (need_odp) cookie = rds_rdma_make_cookie(mr->r_key, 0); else cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = -EFAULT; goto out; } /* Inserting the new MR into the rbtree bumps its * reference count. 
*/ spin_lock_irqsave(&rs->rs_rdma_lock, flags); found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); BUG_ON(found && found != mr); rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { kref_get(&mr->r_kref); *mr_ret = mr; } ret = 0; out: kfree(pages); if (mr) kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; /* * Initially, just behave like get_mr(). * TODO: Implement get_mr as wrapper around this * and deprecate it. */ new_args.vec = args.vec; new_args.cookie_addr = args.cookie_addr; new_args.flags = args.flags; return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); } /* * Free the MR indicated by the given R_Key */ int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; unsigned long flags; if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ if (args.cookie == 0) { if (!rs->rs_transport || !rs->rs_transport->flush_mrs) return -EINVAL; rs->rs_transport->flush_mrs(); return 0; } /* Look up the MR given its R_key and remove it from the rbtree * so nobody else finds it. * This should also prevent races with rds_rdma_unuse. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); if (mr) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); if (args.flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (!mr) return -EINVAL; kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } /* * This is called when we receive an extension header that * tells us this MR was used. It allows us to implement * use_once semantics */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) { struct rds_mr *mr; unsigned long flags; int zot_me = 0; spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } /* Get a reference so that the MR won't go away before calling * sync_mr() below. */ kref_get(&mr->r_kref); /* If it is going to be freed, remove it from the tree now so * that no other thread can find it and free it. */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); /* Release the reference held above. 
*/ kref_put(&mr->r_kref, __rds_put_mr_final); /* If the MR was marked as invalidate, this will * trigger an async flush. */ if (zot_me) kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; if (ro->op_odp_mr) { kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) { struct page *page = sg_page(ao->op_sg); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; ao->op_active = 0; } /* * Count the number of pages needed to describe an incoming iovec array. */ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) { int tot_pages = 0; unsigned int nr_pages; unsigned int i; /* figure out the number of pages in the vector */ for (i = 0; i < nr_iovecs; i++) { nr_pages = rds_pages_in_vec(&iov[i]); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages; } int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov) { struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; unsigned int i; local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; if (args->nr_local == 0) return -EINVAL; if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); if (!iov->iov) return -ENOMEM; vec = &iov->iov[0]; if (copy_from_user(vec, local_vec, args->nr_local * sizeof(struct rds_iovec))) return -EFAULT; iov->len = args->nr_local; /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++, vec++) { nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages * sizeof(struct scatterlist); } /* * The application asks for a RDMA transfer. 
* Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec *iovs; unsigned int i, j; int ret = 0; bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out_ret; } if (vec->len != args->nr_local) { ret = -EINVAL; goto out_ret; } /* odp-mr is not supported for multiple requests within one message */ if (args->nr_local != 1) odp_supported = false; iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; op->op_odp_mr = NULL; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); if (IS_ERR(op->op_sg)) { ret = PTR_ERR(op->op_sg); goto out_pages; } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and * optionally an offset into it. This is how we implement RDMA into * unaligned memory. * When setting up the RDMA, we need to add that offset to the * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ op->op_rkey = rds_rdma_cookie_key(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, op->op_rkey); for (i = 0; i < args->nr_local; i++) { struct rds_iovec *iov = &iovs[i]; /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ unsigned int nr = rds_pages_in_vec(iov); rs->rs_user_addr = iov->addr; rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. 
*/ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if ((!odp_supported && ret <= 0) || (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; if (ret == -EOPNOTSUPP) { struct rds_mr *local_odp_mr; if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out_pages; } local_odp_mr = kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = rs->rs_transport->get_mr( NULL, 0, rs, &local_odp_mr->r_key, NULL, iov->addr, iov->bytes, ODP_VIRTUAL); if (IS_ERR(local_odp_mr->r_trans_private)) { ret = PTR_ERR(local_odp_mr->r_trans_private); rdsdebug("get_mr ret %d %p\"", ret, local_odp_mr->r_trans_private); kfree(local_odp_mr); ret = -EOPNOTSUPP; goto out_pages; } rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", local_odp_mr, local_odp_mr->r_trans_private); op->op_odp_mr = local_odp_mr; op->op_odp_addr = iov->addr; } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); iov->addr += sg->length; iov->bytes -= sg->length; } op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; goto out_pages; } op->op_bytes = nr_bytes; ret = 0; out_pages: kfree(pages); out_ret: if (ret) rds_rdma_free_op(op); else rds_stats_inc(s_send_rdma); return ret; } /* * The application wants us to pass an RDMA destination (aka MR) * to the remote */ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { unsigned long flags; struct rds_mr *mr; u32 r_key; int err = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); /* We are reusing a previously mapped MR here. Most likely, the * application has written to the buffer, so we need to explicitly * flush those writes to RAM. Otherwise the HCA may not see them * when doing a DMA from that buffer. */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) err = -EINVAL; /* invalid r_key */ else kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; } /* * The application passes us an address range it wants to enable RDMA * to/from. We map the area, and save the <R_Key,offset> pair * in rm->m_rdma_cookie. This causes it to be sent along to the peer * in an extension header. 
*/ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || rm->m_rdma_cookie != 0) return -EINVAL; return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr, rm->m_conn_path); } /* * Fill in rds_message for an atomic request. */ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct page *page = NULL; struct rds_atomic_args *args; int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) || rm->atomic.op_active) return -EINVAL; args = CMSG_DATA(cmsg); /* Nonmasked & masked cmsg ops converted to masked hw ops */ switch (cmsg->cmsg_type) { case RDS_CMSG_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->fadd.add; rm->atomic.op_m_fadd.nocarry_mask = 0; break; case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->m_fadd.add; rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; break; case RDS_CMSG_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->cswp.compare; rm->atomic.op_m_cswp.swap = args->cswp.swap; rm->atomic.op_m_cswp.compare_mask = ~0; rm->atomic.op_m_cswp.swap_mask = ~0; break; case RDS_CMSG_MASKED_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->m_cswp.compare; rm->atomic.op_m_cswp.swap = args->m_cswp.swap; rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; break; default: BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); if (IS_ERR(rm->atomic.op_sg)) { ret = PTR_ERR(rm->atomic.op_sg); goto err; } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { ret = -EFAULT; goto err; } ret = rds_pin_pages(args->local_addr, 1, &page, 1); if (ret != 1) goto err; ret = 0; sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); if (rm->atomic.op_notify || rm->atomic.op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); return ret; err: if (page) unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; } |
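/*
 * Illustrative sketch, not part of the file above: the page-count
 * arithmetic used by rds_pages_in_vec(), shown in isolation without the
 * overflow checks. With 4 KiB pages, a vector at addr 4196 spanning 8292
 * bytes ends at byte 12487 and therefore touches pages 1, 2 and 3; the
 * rounded-up difference below yields exactly 3. The helper name and the
 * numbers are made up for the example.
 */
#include <linux/mm.h>

static unsigned int my_pages_in_range(u64 addr, u64 bytes)
{
	return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (addr >> PAGE_SHIFT);
}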
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BITOPS_H
#define _LINUX_BITOPS_H

#include <asm/types.h>
#include <linux/bits.h>
#include <linux/typecheck.h>
#include <uapi/linux/kernel.h>

/* Set bits in the first 'n' bytes when loaded from memory */
#ifdef __LITTLE_ENDIAN
#  define aligned_byte_mask(n) ((1UL << 8*(n))-1)
#else
#  define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
#endif

#define BITS_PER_TYPE(type)	(sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr)	__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define BITS_TO_U64(nr)		__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
#define BITS_TO_U32(nr)		__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
#define BITS_TO_BYTES(nr)	__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))

extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);

/*
 * Defined here because those may be needed by architecture-specific static
 * inlines.
 */
#include <asm-generic/bitops/generic-non-atomic.h>

/*
 * Many architecture-specific non-atomic bitops contain inline asm code and due
 * to that the compiler can't optimize them to compile-time expressions or
 * constants. By contrast, generic_*() helpers are defined in pure C and
 * compilers optimize them just as well.
 * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively
 * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when
 * the arguments can be resolved at compile time. That expression itself is a
 * constant and doesn't bring any functional changes to the rest of cases.
 * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when
 * passing a bitmap from .bss or .data (-> `!!addr` is always true).
 */
#define bitop(op, nr, addr)						\
	((__builtin_constant_p(nr) &&					\
	  __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) &&	\
	  (uintptr_t)(addr) != (uintptr_t)NULL &&			\
	  __builtin_constant_p(*(const unsigned long *)(addr))) ?
\ const##op(nr, addr) : op(nr, addr)) #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit * * This is safe to use for 16- and 8-bit types as well. 
*/ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ static inline unsigned long __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int bit; while (word) { bit = __ffs(word); if (n-- == 0) return bit; __clear_bit(bit, &word); } return BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ static __always_inline void assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) set_bit(nr, addr); else clear_bit(nr, addr); } static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) __set_bit(nr, addr); else __clear_bit(nr, addr); } /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... 
* } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif |
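/*
 * Illustrative usage sketch, not part of the header above: sign_extend32()
 * turning a 12-bit two's-complement field into a signed value, and
 * set_mask_bits() atomically replacing a 2-bit field. The variables and
 * values are hypothetical.
 */
#include <linux/bitops.h>

static unsigned long my_state;

static void my_demo(void)
{
	/* bit 11 is the sign bit of a 12-bit sample: 0xFFF -> -1 */
	s32 sample = sign_extend32(0xFFF, 11);

	/* clear bits 0-1 of my_state and set them to 0b10 in one cmpxchg loop */
	set_mask_bits(&my_state, 0x3UL, 0x2UL);

	(void)sample;
}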
// SPDX-License-Identifier: GPL-2.0
/*
 * platform.c - platform 'pseudo' bus for legacy devices
 *
 * Copyright (c) 2002-3 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 *
 * Please see Documentation/driver-api/driver-model/platform.rst for more
 * information.
*/ #include <linux/string.h> #include <linux/platform_device.h> #include <linux/of_device.h> #include <linux/of_irq.h> #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/dma-mapping.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/pm_runtime.h> #include <linux/pm_domain.h> #include <linux/idr.h> #include <linux/acpi.h> #include <linux/clk/clk-conf.h> #include <linux/limits.h> #include <linux/property.h> #include <linux/kmemleak.h> #include <linux/types.h> #include <linux/iommu.h> #include <linux/dma-map-ops.h> #include "base.h" #include "power/power.h" /* For automatically allocated device IDs */ static DEFINE_IDA(platform_devid_ida); struct device platform_bus = { .init_name = "platform", }; EXPORT_SYMBOL_GPL(platform_bus); /** * platform_get_resource - get a resource for a device * @dev: platform device * @type: resource type * @num: resource index * * Return: a pointer to the resource or NULL on failure. */ struct resource *platform_get_resource(struct platform_device *dev, unsigned int type, unsigned int num) { u32 i; for (i = 0; i < dev->num_resources; i++) { struct resource *r = &dev->resource[i]; if (type == resource_type(r) && num-- == 0) return r; } return NULL; } EXPORT_SYMBOL_GPL(platform_get_resource); struct resource *platform_get_mem_or_io(struct platform_device *dev, unsigned int num) { u32 i; for (i = 0; i < dev->num_resources; i++) { struct resource *r = &dev->resource[i]; if ((resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) && num-- == 0) return r; } return NULL; } EXPORT_SYMBOL_GPL(platform_get_mem_or_io); #ifdef CONFIG_HAS_IOMEM /** * devm_platform_get_and_ioremap_resource - call devm_ioremap_resource() for a * platform device and get resource * * @pdev: platform device to use both for memory resource lookup as well as * resource management * @index: resource index * @res: optional output parameter to store a pointer to the obtained resource. * * Return: a pointer to the remapped memory or an ERR_PTR() encoded error code * on failure. */ void __iomem * devm_platform_get_and_ioremap_resource(struct platform_device *pdev, unsigned int index, struct resource **res) { struct resource *r; r = platform_get_resource(pdev, IORESOURCE_MEM, index); if (res) *res = r; return devm_ioremap_resource(&pdev->dev, r); } EXPORT_SYMBOL_GPL(devm_platform_get_and_ioremap_resource); /** * devm_platform_ioremap_resource - call devm_ioremap_resource() for a platform * device * * @pdev: platform device to use both for memory resource lookup as well as * resource management * @index: resource index * * Return: a pointer to the remapped memory or an ERR_PTR() encoded error code * on failure. */ void __iomem *devm_platform_ioremap_resource(struct platform_device *pdev, unsigned int index) { return devm_platform_get_and_ioremap_resource(pdev, index, NULL); } EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource); /** * devm_platform_ioremap_resource_byname - call devm_ioremap_resource for * a platform device, retrieve the * resource by name * * @pdev: platform device to use both for memory resource lookup as well as * resource management * @name: name of the resource * * Return: a pointer to the remapped memory or an ERR_PTR() encoded error code * on failure. 
*/ void __iomem * devm_platform_ioremap_resource_byname(struct platform_device *pdev, const char *name) { struct resource *res; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, name); return devm_ioremap_resource(&pdev->dev, res); } EXPORT_SYMBOL_GPL(devm_platform_ioremap_resource_byname); #endif /* CONFIG_HAS_IOMEM */ /** * platform_get_irq_optional - get an optional IRQ for a device * @dev: platform device * @num: IRQ number index * * Gets an IRQ for a platform device. Device drivers should check the return * value for errors so as to not pass a negative integer value to the * request_irq() APIs. This is the same as platform_get_irq(), except that it * does not print an error message if an IRQ can not be obtained. * * For example:: * * int irq = platform_get_irq_optional(pdev, 0); * if (irq < 0) * return irq; * * Return: non-zero IRQ number on success, negative error number on failure. */ int platform_get_irq_optional(struct platform_device *dev, unsigned int num) { int ret; #ifdef CONFIG_SPARC /* sparc does not have irqs represented as IORESOURCE_IRQ resources */ if (!dev || num >= dev->archdata.num_irqs) goto out_not_found; ret = dev->archdata.irqs[num]; goto out; #else struct fwnode_handle *fwnode = dev_fwnode(&dev->dev); struct resource *r; if (is_of_node(fwnode)) { ret = of_irq_get(to_of_node(fwnode), num); if (ret > 0 || ret == -EPROBE_DEFER) goto out; } r = platform_get_resource(dev, IORESOURCE_IRQ, num); if (is_acpi_device_node(fwnode)) { if (r && r->flags & IORESOURCE_DISABLED) { ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), num, r); if (ret) goto out; } } /* * The resources may pass trigger flags to the irqs that need * to be set up. It so happens that the trigger flags for * IORESOURCE_BITS correspond 1-to-1 to the IRQF_TRIGGER* * settings. */ if (r && r->flags & IORESOURCE_BITS) { struct irq_data *irqd; irqd = irq_get_irq_data(r->start); if (!irqd) goto out_not_found; irqd_set_trigger_type(irqd, r->flags & IORESOURCE_BITS); } if (r) { ret = r->start; goto out; } /* * For the index 0 interrupt, allow falling back to GpioInt * resources. While a device could have both Interrupt and GpioInt * resources, making this fallback ambiguous, in many common cases * the device will only expose one IRQ, and this fallback * allows a common code path across either kind of resource. */ if (num == 0 && is_acpi_device_node(fwnode)) { ret = acpi_dev_gpio_irq_get(to_acpi_device_node(fwnode), num); /* Our callers expect -ENXIO for missing IRQs. */ if (ret >= 0 || ret == -EPROBE_DEFER) goto out; } #endif out_not_found: ret = -ENXIO; out: if (WARN(!ret, "0 is an invalid IRQ number\n")) return -EINVAL; return ret; } EXPORT_SYMBOL_GPL(platform_get_irq_optional); /** * platform_get_irq - get an IRQ for a device * @dev: platform device * @num: IRQ number index * * Gets an IRQ for a platform device and prints an error message if finding the * IRQ fails. Device drivers should check the return value for errors so as to * not pass a negative integer value to the request_irq() APIs. * * For example:: * * int irq = platform_get_irq(pdev, 0); * if (irq < 0) * return irq; * * Return: non-zero IRQ number on success, negative error number on failure. 
 */
int platform_get_irq(struct platform_device *dev, unsigned int num)
{
	int ret;

	ret = platform_get_irq_optional(dev, num);
	if (ret < 0)
		return dev_err_probe(&dev->dev, ret,
				     "IRQ index %u not found\n", num);

	return ret;
}
EXPORT_SYMBOL_GPL(platform_get_irq);

/**
 * platform_irq_count - Count the number of IRQs a platform device uses
 * @dev: platform device
 *
 * Return: Number of IRQs a platform device uses or EPROBE_DEFER
 */
int platform_irq_count(struct platform_device *dev)
{
	int ret, nr = 0;

	while ((ret = platform_get_irq_optional(dev, nr)) >= 0)
		nr++;

	if (ret == -EPROBE_DEFER)
		return ret;

	return nr;
}
EXPORT_SYMBOL_GPL(platform_irq_count);

struct irq_affinity_devres {
	unsigned int count;
	unsigned int irq[] __counted_by(count);
};

static void platform_disable_acpi_irq(struct platform_device *pdev, int index)
{
	struct resource *r;

	r = platform_get_resource(pdev, IORESOURCE_IRQ, index);
	if (r)
		irqresource_disabled(r, 0);
}

static void devm_platform_get_irqs_affinity_release(struct device *dev,
						    void *res)
{
	struct irq_affinity_devres *ptr = res;
	int i;

	for (i = 0; i < ptr->count; i++) {
		irq_dispose_mapping(ptr->irq[i]);

		if (is_acpi_device_node(dev_fwnode(dev)))
			platform_disable_acpi_irq(to_platform_device(dev), i);
	}
}

/**
 * devm_platform_get_irqs_affinity - devm method to get a set of IRQs for a
 *				device using an interrupt affinity descriptor
 * @dev: platform device pointer
 * @affd: affinity descriptor
 * @minvec: minimum count of interrupt vectors
 * @maxvec: maximum count of interrupt vectors
 * @irqs: pointer holder for IRQ numbers
 *
 * Gets a set of IRQs for a platform device, and updates IRQ affinity according
 * to the passed affinity descriptor.
 *
 * Return: Number of vectors on success, negative error number on failure.
 */
int devm_platform_get_irqs_affinity(struct platform_device *dev,
				    struct irq_affinity *affd,
				    unsigned int minvec,
				    unsigned int maxvec,
				    int **irqs)
{
	struct irq_affinity_devres *ptr;
	struct irq_affinity_desc *desc;
	size_t size;
	int i, ret, nvec;

	if (!affd)
		return -EPERM;

	if (maxvec < minvec)
		return -ERANGE;

	nvec = platform_irq_count(dev);
	if (nvec < 0)
		return nvec;

	if (nvec < minvec)
		return -ENOSPC;

	nvec = irq_calc_affinity_vectors(minvec, nvec, affd);
	if (nvec < minvec)
		return -ENOSPC;

	if (nvec > maxvec)
		nvec = maxvec;

	size = sizeof(*ptr) + sizeof(unsigned int) * nvec;
	ptr = devres_alloc(devm_platform_get_irqs_affinity_release, size,
			   GFP_KERNEL);
	if (!ptr)
		return -ENOMEM;

	ptr->count = nvec;

	for (i = 0; i < nvec; i++) {
		int irq = platform_get_irq(dev, i);
		if (irq < 0) {
			ret = irq;
			goto err_free_devres;
		}
		ptr->irq[i] = irq;
	}

	desc = irq_create_affinity_masks(nvec, affd);
	if (!desc) {
		ret = -ENOMEM;
		goto err_free_devres;
	}

	for (i = 0; i < nvec; i++) {
		ret = irq_update_affinity_desc(ptr->irq[i], &desc[i]);
		if (ret) {
			dev_err(&dev->dev, "failed to update irq%d affinity descriptor (%d)\n",
				ptr->irq[i], ret);
			goto err_free_desc;
		}
	}

	devres_add(&dev->dev, ptr);

	kfree(desc);

	*irqs = ptr->irq;

	return nvec;

err_free_desc:
	kfree(desc);
err_free_devres:
	devres_free(ptr);
	return ret;
}
EXPORT_SYMBOL_GPL(devm_platform_get_irqs_affinity);

/**
 * platform_get_resource_byname - get a resource for a device by name
 * @dev: platform device
 * @type: resource type
 * @name: resource name
 */
struct resource *platform_get_resource_byname(struct platform_device *dev,
					      unsigned int type,
					      const char *name)
{
	u32 i;

	for (i = 0; i < dev->num_resources; i++) {
		struct resource *r = &dev->resource[i];

		if (unlikely(!r->name))
			continue;

		if (type == resource_type(r) && !strcmp(r->name, name))
			return r;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(platform_get_resource_byname);

static int __platform_get_irq_byname(struct platform_device *dev,
				     const char *name)
{
	struct resource *r;
	int ret;

	ret = fwnode_irq_get_byname(dev_fwnode(&dev->dev), name);
	if (ret > 0 || ret == -EPROBE_DEFER)
		return ret;

	r = platform_get_resource_byname(dev, IORESOURCE_IRQ, name);
	if (r) {
		if (WARN(!r->start, "0 is an invalid IRQ number\n"))
			return -EINVAL;
		return r->start;
	}

	return -ENXIO;
}

/**
 * platform_get_irq_byname - get an IRQ for a device by name
 * @dev: platform device
 * @name: IRQ name
 *
 * Get an IRQ like platform_get_irq(), but by name rather than by index.
 *
 * Return: non-zero IRQ number on success, negative error number on failure.
 */
int platform_get_irq_byname(struct platform_device *dev, const char *name)
{
	int ret;

	ret = __platform_get_irq_byname(dev, name);
	if (ret < 0)
		return dev_err_probe(&dev->dev, ret, "IRQ %s not found\n",
				     name);
	return ret;
}
EXPORT_SYMBOL_GPL(platform_get_irq_byname);

/**
 * platform_get_irq_byname_optional - get an optional IRQ for a device by name
 * @dev: platform device
 * @name: IRQ name
 *
 * Get an optional IRQ by name like platform_get_irq_byname(), except that it
 * does not print an error message if an IRQ cannot be obtained.
 *
 * Return: non-zero IRQ number on success, negative error number on failure.
 */
int platform_get_irq_byname_optional(struct platform_device *dev,
				     const char *name)
{
	return __platform_get_irq_byname(dev, name);
}
EXPORT_SYMBOL_GPL(platform_get_irq_byname_optional);

/**
 * platform_add_devices - add a number of platform devices
 * @devs: array of platform devices to add
 * @num: number of platform devices in array
 *
 * Return: 0 on success, negative error number on failure.
 */
int platform_add_devices(struct platform_device **devs, int num)
{
	int i, ret = 0;

	for (i = 0; i < num; i++) {
		ret = platform_device_register(devs[i]);
		if (ret) {
			while (--i >= 0)
				platform_device_unregister(devs[i]);
			break;
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(platform_add_devices);

struct platform_object {
	struct platform_device pdev;
	char name[];
};

/*
 * Set up default DMA mask for platform devices if they weren't
 * previously set by the architecture / DT.
 */
static void setup_pdev_dma_masks(struct platform_device *pdev)
{
	pdev->dev.dma_parms = &pdev->dma_parms;

	if (!pdev->dev.coherent_dma_mask)
		pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
	if (!pdev->dev.dma_mask) {
		pdev->platform_dma_mask = DMA_BIT_MASK(32);
		pdev->dev.dma_mask = &pdev->platform_dma_mask;
	}
}

/**
 * platform_device_put - destroy a platform device
 * @pdev: platform device to free
 *
 * Free all memory associated with a platform device.  This function must
 * _only_ be externally called in error cases.  All other usage is a bug.
 */
void platform_device_put(struct platform_device *pdev)
{
	if (!IS_ERR_OR_NULL(pdev))
		put_device(&pdev->dev);
}
EXPORT_SYMBOL_GPL(platform_device_put);

static void platform_device_release(struct device *dev)
{
	struct platform_object *pa = container_of(dev, struct platform_object,
						  pdev.dev);

	of_node_put(pa->pdev.dev.of_node);
	kfree(pa->pdev.dev.platform_data);
	kfree(pa->pdev.mfd_cell);
	kfree(pa->pdev.resource);
	kfree(pa->pdev.driver_override);
	kfree(pa);
}

/**
 * platform_device_alloc - create a platform device
 * @name: base name of the device we're adding
 * @id: instance id
 *
 * Create a platform device object which can have other objects attached
 * to it, and which will have attached objects freed when it is released.
*/ struct platform_device *platform_device_alloc(const char *name, int id) { struct platform_object *pa; pa = kzalloc(sizeof(*pa) + strlen(name) + 1, GFP_KERNEL); if (pa) { strcpy(pa->name, name); pa->pdev.name = pa->name; pa->pdev.id = id; device_initialize(&pa->pdev.dev); pa->pdev.dev.release = platform_device_release; setup_pdev_dma_masks(&pa->pdev); } return pa ? &pa->pdev : NULL; } EXPORT_SYMBOL_GPL(platform_device_alloc); /** * platform_device_add_resources - add resources to a platform device * @pdev: platform device allocated by platform_device_alloc to add resources to * @res: set of resources that needs to be allocated for the device * @num: number of resources * * Add a copy of the resources to the platform device. The memory * associated with the resources will be freed when the platform device is * released. */ int platform_device_add_resources(struct platform_device *pdev, const struct resource *res, unsigned int num) { struct resource *r = NULL; if (res) { r = kmemdup(res, sizeof(struct resource) * num, GFP_KERNEL); if (!r) return -ENOMEM; } kfree(pdev->resource); pdev->resource = r; pdev->num_resources = num; return 0; } EXPORT_SYMBOL_GPL(platform_device_add_resources); /** * platform_device_add_data - add platform-specific data to a platform device * @pdev: platform device allocated by platform_device_alloc to add resources to * @data: platform specific data for this platform device * @size: size of platform specific data * * Add a copy of platform specific data to the platform device's * platform_data pointer. The memory associated with the platform data * will be freed when the platform device is released. */ int platform_device_add_data(struct platform_device *pdev, const void *data, size_t size) { void *d = NULL; if (data) { d = kmemdup(data, size, GFP_KERNEL); if (!d) return -ENOMEM; } kfree(pdev->dev.platform_data); pdev->dev.platform_data = d; return 0; } EXPORT_SYMBOL_GPL(platform_device_add_data); /** * platform_device_add - add a platform device to device hierarchy * @pdev: platform device we're adding * * This is part 2 of platform_device_register(), though may be called * separately _iff_ pdev was allocated by platform_device_alloc(). */ int platform_device_add(struct platform_device *pdev) { struct device *dev = &pdev->dev; u32 i; int ret; if (!dev->parent) dev->parent = &platform_bus; dev->bus = &platform_bus_type; switch (pdev->id) { default: dev_set_name(dev, "%s.%d", pdev->name, pdev->id); break; case PLATFORM_DEVID_NONE: dev_set_name(dev, "%s", pdev->name); break; case PLATFORM_DEVID_AUTO: /* * Automatically allocated device ID. We mark it as such so * that we remember it must be freed, and we append a suffix * to avoid namespace collision with explicit IDs. */ ret = ida_alloc(&platform_devid_ida, GFP_KERNEL); if (ret < 0) return ret; pdev->id = ret; pdev->id_auto = true; dev_set_name(dev, "%s.%d.auto", pdev->name, pdev->id); break; } for (i = 0; i < pdev->num_resources; i++) { struct resource *p, *r = &pdev->resource[i]; if (r->name == NULL) r->name = dev_name(dev); p = r->parent; if (!p) { if (resource_type(r) == IORESOURCE_MEM) p = &iomem_resource; else if (resource_type(r) == IORESOURCE_IO) p = &ioport_resource; } if (p) { ret = insert_resource(p, r); if (ret) { dev_err(dev, "failed to claim resource %d: %pR\n", i, r); goto failed; } } } pr_debug("Registering platform device '%s'. 
Parent at %s\n", dev_name(dev), dev_name(dev->parent)); ret = device_add(dev); if (ret) goto failed; return 0; failed: if (pdev->id_auto) { ida_free(&platform_devid_ida, pdev->id); pdev->id = PLATFORM_DEVID_AUTO; } while (i--) { struct resource *r = &pdev->resource[i]; if (r->parent) release_resource(r); } return ret; } EXPORT_SYMBOL_GPL(platform_device_add); /** * platform_device_del - remove a platform-level device * @pdev: platform device we're removing * * Note that this function will also release all memory- and port-based * resources owned by the device (@dev->resource). This function must * _only_ be externally called in error cases. All other usage is a bug. */ void platform_device_del(struct platform_device *pdev) { u32 i; if (!IS_ERR_OR_NULL(pdev)) { device_del(&pdev->dev); if (pdev->id_auto) { ida_free(&platform_devid_ida, pdev->id); pdev->id = PLATFORM_DEVID_AUTO; } for (i = 0; i < pdev->num_resources; i++) { struct resource *r = &pdev->resource[i]; if (r->parent) release_resource(r); } } } EXPORT_SYMBOL_GPL(platform_device_del); /** * platform_device_register - add a platform-level device * @pdev: platform device we're adding * * NOTE: _Never_ directly free @pdev after calling this function, even if it * returned an error! Always use platform_device_put() to give up the * reference initialised in this function instead. */ int platform_device_register(struct platform_device *pdev) { device_initialize(&pdev->dev); setup_pdev_dma_masks(pdev); return platform_device_add(pdev); } EXPORT_SYMBOL_GPL(platform_device_register); /** * platform_device_unregister - unregister a platform-level device * @pdev: platform device we're unregistering * * Unregistration is done in 2 steps. First we release all resources * and remove it from the subsystem, then we drop reference count by * calling platform_device_put(). */ void platform_device_unregister(struct platform_device *pdev) { platform_device_del(pdev); platform_device_put(pdev); } EXPORT_SYMBOL_GPL(platform_device_unregister); /** * platform_device_register_full - add a platform-level device with * resources and platform-specific data * * @pdevinfo: data used to create device * * Returns &struct platform_device pointer on success, or ERR_PTR() on error. 
*/ struct platform_device *platform_device_register_full( const struct platform_device_info *pdevinfo) { int ret; struct platform_device *pdev; pdev = platform_device_alloc(pdevinfo->name, pdevinfo->id); if (!pdev) return ERR_PTR(-ENOMEM); pdev->dev.parent = pdevinfo->parent; pdev->dev.fwnode = pdevinfo->fwnode; pdev->dev.of_node = of_node_get(to_of_node(pdev->dev.fwnode)); pdev->dev.of_node_reused = pdevinfo->of_node_reused; if (pdevinfo->dma_mask) { pdev->platform_dma_mask = pdevinfo->dma_mask; pdev->dev.dma_mask = &pdev->platform_dma_mask; pdev->dev.coherent_dma_mask = pdevinfo->dma_mask; } ret = platform_device_add_resources(pdev, pdevinfo->res, pdevinfo->num_res); if (ret) goto err; ret = platform_device_add_data(pdev, pdevinfo->data, pdevinfo->size_data); if (ret) goto err; if (pdevinfo->properties) { ret = device_create_managed_software_node(&pdev->dev, pdevinfo->properties, NULL); if (ret) goto err; } ret = platform_device_add(pdev); if (ret) { err: ACPI_COMPANION_SET(&pdev->dev, NULL); platform_device_put(pdev); return ERR_PTR(ret); } return pdev; } EXPORT_SYMBOL_GPL(platform_device_register_full); /** * __platform_driver_register - register a driver for platform-level devices * @drv: platform driver structure * @owner: owning module/driver */ int __platform_driver_register(struct platform_driver *drv, struct module *owner) { drv->driver.owner = owner; drv->driver.bus = &platform_bus_type; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__platform_driver_register); /** * platform_driver_unregister - unregister a driver for platform-level devices * @drv: platform driver structure */ void platform_driver_unregister(struct platform_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(platform_driver_unregister); static int platform_probe_fail(struct platform_device *pdev) { return -ENXIO; } static int is_bound_to_driver(struct device *dev, void *driver) { if (dev->driver == driver) return 1; return 0; } /** * __platform_driver_probe - register driver for non-hotpluggable device * @drv: platform driver structure * @probe: the driver probe routine, probably from an __init section * @module: module which will be the owner of the driver * * Use this instead of platform_driver_register() when you know the device * is not hotpluggable and has already been registered, and you want to * remove its run-once probe() infrastructure from memory after the driver * has bound to the device. * * One typical use for this would be with drivers for controllers integrated * into system-on-chip processors, where the controller devices have been * configured as part of board setup. * * Note that this is incompatible with deferred probing. * * Returns zero if the driver registered and bound to a device, else returns * a negative error code and with the driver not registered. */ int __init_or_module __platform_driver_probe(struct platform_driver *drv, int (*probe)(struct platform_device *), struct module *module) { int retval; if (drv->driver.probe_type == PROBE_PREFER_ASYNCHRONOUS) { pr_err("%s: drivers registered with %s can not be probed asynchronously\n", drv->driver.name, __func__); return -EINVAL; } /* * We have to run our probes synchronously because we check if * we find any devices to bind to and exit with error if there * are any. */ drv->driver.probe_type = PROBE_FORCE_SYNCHRONOUS; /* * Prevent driver from requesting probe deferral to avoid further * futile probe attempts. 
*/ drv->prevent_deferred_probe = true; /* make sure driver won't have bind/unbind attributes */ drv->driver.suppress_bind_attrs = true; /* temporary section violation during probe() */ drv->probe = probe; retval = __platform_driver_register(drv, module); if (retval) return retval; /* Force all new probes of this driver to fail */ drv->probe = platform_probe_fail; /* Walk all platform devices and see if any actually bound to this driver. * If not, return an error as the device should have done so by now. */ if (!bus_for_each_dev(&platform_bus_type, NULL, &drv->driver, is_bound_to_driver)) { retval = -ENODEV; platform_driver_unregister(drv); } return retval; } EXPORT_SYMBOL_GPL(__platform_driver_probe); /** * __platform_create_bundle - register driver and create corresponding device * @driver: platform driver structure * @probe: the driver probe routine, probably from an __init section * @res: set of resources that needs to be allocated for the device * @n_res: number of resources * @data: platform specific data for this platform device * @size: size of platform specific data * @module: module which will be the owner of the driver * * Use this in legacy-style modules that probe hardware directly and * register a single platform device and corresponding platform driver. * * Returns &struct platform_device pointer on success, or ERR_PTR() on error. */ struct platform_device * __init_or_module __platform_create_bundle( struct platform_driver *driver, int (*probe)(struct platform_device *), struct resource *res, unsigned int n_res, const void *data, size_t size, struct module *module) { struct platform_device *pdev; int error; pdev = platform_device_alloc(driver->driver.name, -1); if (!pdev) { error = -ENOMEM; goto err_out; } error = platform_device_add_resources(pdev, res, n_res); if (error) goto err_pdev_put; error = platform_device_add_data(pdev, data, size); if (error) goto err_pdev_put; error = platform_device_add(pdev); if (error) goto err_pdev_put; error = __platform_driver_probe(driver, probe, module); if (error) goto err_pdev_del; return pdev; err_pdev_del: platform_device_del(pdev); err_pdev_put: platform_device_put(pdev); err_out: return ERR_PTR(error); } EXPORT_SYMBOL_GPL(__platform_create_bundle); /** * __platform_register_drivers - register an array of platform drivers * @drivers: an array of drivers to register * @count: the number of drivers to register * @owner: module owning the drivers * * Registers platform drivers specified by an array. On failure to register a * driver, all previously registered drivers will be unregistered. Callers of * this API should use platform_unregister_drivers() to unregister drivers in * the reverse order. * * Returns: 0 on success or a negative error code on failure. 
*/ int __platform_register_drivers(struct platform_driver * const *drivers, unsigned int count, struct module *owner) { unsigned int i; int err; for (i = 0; i < count; i++) { pr_debug("registering platform driver %ps\n", drivers[i]); err = __platform_driver_register(drivers[i], owner); if (err < 0) { pr_err("failed to register platform driver %ps: %d\n", drivers[i], err); goto error; } } return 0; error: while (i--) { pr_debug("unregistering platform driver %ps\n", drivers[i]); platform_driver_unregister(drivers[i]); } return err; } EXPORT_SYMBOL_GPL(__platform_register_drivers); /** * platform_unregister_drivers - unregister an array of platform drivers * @drivers: an array of drivers to unregister * @count: the number of drivers to unregister * * Unregisters platform drivers specified by an array. This is typically used * to complement an earlier call to platform_register_drivers(). Drivers are * unregistered in the reverse order in which they were registered. */ void platform_unregister_drivers(struct platform_driver * const *drivers, unsigned int count) { while (count--) { pr_debug("unregistering platform driver %ps\n", drivers[count]); platform_driver_unregister(drivers[count]); } } EXPORT_SYMBOL_GPL(platform_unregister_drivers); static const struct platform_device_id *platform_match_id( const struct platform_device_id *id, struct platform_device *pdev) { while (id->name[0]) { if (strcmp(pdev->name, id->name) == 0) { pdev->id_entry = id; return id; } id++; } return NULL; } #ifdef CONFIG_PM_SLEEP static int platform_legacy_suspend(struct device *dev, pm_message_t mesg) { struct platform_driver *pdrv = to_platform_driver(dev->driver); struct platform_device *pdev = to_platform_device(dev); int ret = 0; if (dev->driver && pdrv->suspend) ret = pdrv->suspend(pdev, mesg); return ret; } static int platform_legacy_resume(struct device *dev) { struct platform_driver *pdrv = to_platform_driver(dev->driver); struct platform_device *pdev = to_platform_device(dev); int ret = 0; if (dev->driver && pdrv->resume) ret = pdrv->resume(pdev); return ret; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_SUSPEND int platform_pm_suspend(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; if (!drv) return 0; if (drv->pm) { if (drv->pm->suspend) ret = drv->pm->suspend(dev); } else { ret = platform_legacy_suspend(dev, PMSG_SUSPEND); } return ret; } int platform_pm_resume(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; if (!drv) return 0; if (drv->pm) { if (drv->pm->resume) ret = drv->pm->resume(dev); } else { ret = platform_legacy_resume(dev); } return ret; } #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS int platform_pm_freeze(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; if (!drv) return 0; if (drv->pm) { if (drv->pm->freeze) ret = drv->pm->freeze(dev); } else { ret = platform_legacy_suspend(dev, PMSG_FREEZE); } return ret; } int platform_pm_thaw(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; if (!drv) return 0; if (drv->pm) { if (drv->pm->thaw) ret = drv->pm->thaw(dev); } else { ret = platform_legacy_resume(dev); } return ret; } int platform_pm_poweroff(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; if (!drv) return 0; if (drv->pm) { if (drv->pm->poweroff) ret = drv->pm->poweroff(dev); } else { ret = platform_legacy_suspend(dev, PMSG_HIBERNATE); } return ret; } int platform_pm_restore(struct device *dev) { struct device_driver *drv = dev->driver; int ret = 0; 
if (!drv) return 0; if (drv->pm) { if (drv->pm->restore) ret = drv->pm->restore(dev); } else { ret = platform_legacy_resume(dev); } return ret; } #endif /* CONFIG_HIBERNATE_CALLBACKS */ /* modalias support enables more hands-off userspace setup: * (a) environment variable lets new-style hotplug events work once system is * fully running: "modprobe $MODALIAS" * (b) sysfs attribute lets new-style coldplug recover from hotplug events * mishandled before system is fully running: "modprobe $(cat modalias)" */ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { struct platform_device *pdev = to_platform_device(dev); int len; len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; len = acpi_device_modalias(dev, buf, PAGE_SIZE - 1); if (len != -ENODEV) return len; return sysfs_emit(buf, "platform:%s\n", pdev->name); } static DEVICE_ATTR_RO(modalias); static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", dev_to_node(dev)); } static DEVICE_ATTR_RO(numa_node); static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct platform_device *pdev = to_platform_device(dev); ssize_t len; device_lock(dev); len = sysfs_emit(buf, "%s\n", pdev->driver_override); device_unlock(dev); return len; } static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct platform_device *pdev = to_platform_device(dev); int ret; ret = driver_set_override(dev, &pdev->driver_override, buf, count); if (ret) return ret; return count; } static DEVICE_ATTR_RW(driver_override); static struct attribute *platform_dev_attrs[] = { &dev_attr_modalias.attr, &dev_attr_numa_node.attr, &dev_attr_driver_override.attr, NULL, }; static umode_t platform_dev_attrs_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, typeof(*dev), kobj); if (a == &dev_attr_numa_node.attr && dev_to_node(dev) == NUMA_NO_NODE) return 0; return a->mode; } static const struct attribute_group platform_dev_group = { .attrs = platform_dev_attrs, .is_visible = platform_dev_attrs_visible, }; __ATTRIBUTE_GROUPS(platform_dev); /** * platform_match - bind platform device to platform driver. * @dev: device. * @drv: driver. * * Platform device IDs are assumed to be encoded like this: * "<name><instance>", where <name> is a short description of the type of * device, like "pci" or "floppy", and <instance> is the enumerated * instance of the device, like '0' or '42'. Driver IDs are simply * "<name>". So, extract the <name> from the platform_device structure, * and compare it against the name of the driver. Return whether they match * or not. 
*/ static int platform_match(struct device *dev, struct device_driver *drv) { struct platform_device *pdev = to_platform_device(dev); struct platform_driver *pdrv = to_platform_driver(drv); /* When driver_override is set, only bind to the matching driver */ if (pdev->driver_override) return !strcmp(pdev->driver_override, drv->name); /* Attempt an OF style match first */ if (of_driver_match_device(dev, drv)) return 1; /* Then try ACPI style match */ if (acpi_driver_match_device(dev, drv)) return 1; /* Then try to match against the id table */ if (pdrv->id_table) return platform_match_id(pdrv->id_table, pdev) != NULL; /* fall-back to driver name match */ return (strcmp(pdev->name, drv->name) == 0); } static int platform_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct platform_device *pdev = to_platform_device(dev); int rc; /* Some devices have extra OF data and an OF-style MODALIAS */ rc = of_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; rc = acpi_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; add_uevent_var(env, "MODALIAS=%s%s", PLATFORM_MODULE_PREFIX, pdev->name); return 0; } static int platform_probe(struct device *_dev) { struct platform_driver *drv = to_platform_driver(_dev->driver); struct platform_device *dev = to_platform_device(_dev); int ret; /* * A driver registered using platform_driver_probe() cannot be bound * again later because the probe function usually lives in __init code * and so is gone. For these drivers .probe is set to * platform_probe_fail in __platform_driver_probe(). Don't even prepare * clocks and PM domains for these to match the traditional behaviour. */ if (unlikely(drv->probe == platform_probe_fail)) return -ENXIO; ret = of_clk_set_defaults(_dev->of_node, false); if (ret < 0) return ret; ret = dev_pm_domain_attach(_dev, true); if (ret) goto out; if (drv->probe) { ret = drv->probe(dev); if (ret) dev_pm_domain_detach(_dev, true); } out: if (drv->prevent_deferred_probe && ret == -EPROBE_DEFER) { dev_warn(_dev, "probe deferral not supported\n"); ret = -ENXIO; } return ret; } static void platform_remove(struct device *_dev) { struct platform_driver *drv = to_platform_driver(_dev->driver); struct platform_device *dev = to_platform_device(_dev); if (drv->remove_new) { drv->remove_new(dev); } else if (drv->remove) { int ret = drv->remove(dev); if (ret) dev_warn(_dev, "remove callback returned a non-zero value. 
This will be ignored.\n");
	}

	dev_pm_domain_detach(_dev, true);
}

static void platform_shutdown(struct device *_dev)
{
	struct platform_device *dev = to_platform_device(_dev);
	struct platform_driver *drv;

	if (!_dev->driver)
		return;

	drv = to_platform_driver(_dev->driver);
	if (drv->shutdown)
		drv->shutdown(dev);
}

static int platform_dma_configure(struct device *dev)
{
	struct platform_driver *drv = to_platform_driver(dev->driver);
	struct fwnode_handle *fwnode = dev_fwnode(dev);
	enum dev_dma_attr attr;
	int ret = 0;

	if (is_of_node(fwnode)) {
		ret = of_dma_configure(dev, to_of_node(fwnode), true);
	} else if (is_acpi_device_node(fwnode)) {
		attr = acpi_get_dma_attr(to_acpi_device_node(fwnode));
		ret = acpi_dma_configure(dev, attr);
	}
	if (ret || drv->driver_managed_dma)
		return ret;

	ret = iommu_device_use_default_domain(dev);
	if (ret)
		arch_teardown_dma_ops(dev);

	return ret;
}

static void platform_dma_cleanup(struct device *dev)
{
	struct platform_driver *drv = to_platform_driver(dev->driver);

	if (!drv->driver_managed_dma)
		iommu_device_unuse_default_domain(dev);
}

static const struct dev_pm_ops platform_dev_pm_ops = {
	SET_RUNTIME_PM_OPS(pm_generic_runtime_suspend, pm_generic_runtime_resume, NULL)
	USE_PLATFORM_PM_SLEEP_OPS
};

struct bus_type platform_bus_type = {
	.name		= "platform",
	.dev_groups	= platform_dev_groups,
	.match		= platform_match,
	.uevent		= platform_uevent,
	.probe		= platform_probe,
	.remove		= platform_remove,
	.shutdown	= platform_shutdown,
	.dma_configure	= platform_dma_configure,
	.dma_cleanup	= platform_dma_cleanup,
	.pm		= &platform_dev_pm_ops,
};
EXPORT_SYMBOL_GPL(platform_bus_type);

static inline int __platform_match(struct device *dev, const void *drv)
{
	return platform_match(dev, (struct device_driver *)drv);
}

/**
 * platform_find_device_by_driver - Find a platform device with a given
 * driver.
 * @start: The device to start the search from.
 * @drv: The device driver to look for.
 */
struct device *platform_find_device_by_driver(struct device *start,
					      const struct device_driver *drv)
{
	return bus_find_device(&platform_bus_type, start, drv,
			       __platform_match);
}
EXPORT_SYMBOL_GPL(platform_find_device_by_driver);

void __weak __init early_platform_cleanup(void) { }

int __init platform_bus_init(void)
{
	int error;

	early_platform_cleanup();

	error = device_register(&platform_bus);
	if (error) {
		put_device(&platform_bus);
		return error;
	}
	error = bus_register(&platform_bus_type);
	if (error)
		device_unregister(&platform_bus);

	return error;
}
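/*
 * Editor's illustrative sketch (not part of platform.c): a minimal platform
 * driver consuming the API above. The "demo-uart" name and all demo_uart_*
 * symbols are hypothetical, and the block is wrapped in #if 0 so the dump
 * stays inert.
 */
#if 0
#include <linux/err.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/platform_device.h>

struct demo_uart {
	void __iomem *base;
	int irq;
};

static int demo_uart_probe(struct platform_device *pdev)
{
	struct demo_uart *priv;
	struct resource *res;

	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	/* Look up the first MEM resource and ioremap it in one step. */
	priv->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
	if (IS_ERR(priv->base))
		return PTR_ERR(priv->base);

	/* platform_get_irq() never returns 0; a negative value is an errno. */
	priv->irq = platform_get_irq(pdev, 0);
	if (priv->irq < 0)
		return priv->irq;

	platform_set_drvdata(pdev, priv);
	return 0;
}

static struct platform_driver demo_uart_driver = {
	.probe	= demo_uart_probe,
	.driver	= {
		.name = "demo-uart",
	},
};
module_platform_driver(demo_uart_driver);

MODULE_LICENSE("GPL");
#endif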
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the ICMP protocol.
 *
 * Version:	@(#)icmp.h	1.0.3	04/28/93
 *
 * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_ICMP_H
#define _LINUX_ICMP_H

#include <linux/skbuff.h>
#include <uapi/linux/icmp.h>
#include <uapi/linux/errqueue.h>

static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb)
{
	return (struct icmphdr *)skb_transport_header(skb);
}

static inline bool icmp_is_err(int type)
{
	switch (type) {
	case ICMP_DEST_UNREACH:
	case ICMP_SOURCE_QUENCH:
	case ICMP_REDIRECT:
	case ICMP_TIME_EXCEEDED:
	case ICMP_PARAMETERPROB:
		return true;
	}

	return false;
}

void ip_icmp_error_rfc4884(const struct sk_buff *skb,
			   struct sock_ee_data_rfc4884 *out,
			   int thlen, int off);

#endif	/* _LINUX_ICMP_H */
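/*
 * Editor's illustrative sketch (not part of icmp.h): how the accessors above
 * are typically combined. skb_icmp_is_err() is a hypothetical helper; it
 * assumes the skb's transport header already points at the ICMP header, as
 * it does in the ICMP receive path.
 */
#if 0
static inline bool skb_icmp_is_err(const struct sk_buff *skb)
{
	/* icmp_hdr() is only valid once skb_transport_header() is set. */
	return icmp_is_err(icmp_hdr(skb)->type);
}
#endif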
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET3:	Garbage Collector For AF_UNIX sockets
 *
 * Garbage Collector:
 *	Copyright (C) Barak A. Pearlmutter.
 *
 * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
 * If it doesn't work blame me, it worked when Barak sent it.
 *
 * Assumptions:
 *
 *  - object w/ a bit
 *  - free list
 *
 * Current optimizations:
 *
 *  - explicit stack instead of recursion
 *  - tail recurse on first born instead of immediate push/pop
 *  - we gather the stuff that should not be killed into tree
 *    and stack is just a path from root to the current pointer.
 *
 *  Future optimizations:
 *
 *  - don't just push entire root set; process in place
 *
 *  Fixes:
 *	Alan Cox	07 Sept	1997	Vmalloc internal stack as needed.
 *					Cope with changing max_files.
 *	Al Viro		11 Oct 1998
 *		Graph may have cycles. That is, we can send the descriptor
 *		of foo to bar and vice versa. Current code chokes on that.
 *		Fix: move SCM_RIGHTS ones into the separate list and then
 *		skb_free() them all instead of doing explicit fput's.
 *		Another problem: since fput() may block somebody may
 *		create a new unix_socket when we are in the middle of sweep
 *		phase. Fix: revert the logic wrt MARKED. Mark everything
 *		upon the beginning and unmark non-junk ones.
 *
 *		[12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
 *		sent to connect()'ed but still not accept()'ed sockets.
 *		Fixed. Old code had slightly different problem here:
 *		extra fput() in situation when we passed the descriptor via
 *		such socket and closed it (descriptor). That would happen on
 *		each unix_gc() until the accept(). Since the struct file in
 *		question would go to the free list and might be reused...
 *		That might be the reason of random oopses on filp_close()
 *		in unrelated processes.
 *
 *	AV		28 Feb 1999
 *		Kill the explicit allocation of stack. Now we keep the tree
 *		with root in dummy + pointer (gc_current) to one of the nodes.
 *		Stack is represented as path from gc_current to dummy. Unmark
 *		now means "add to tree". Push == "make it a son of gc_current".
 *		Pop == "move gc_current to parent". We keep only pointers to
 *		parents (->gc_tree).
 *	AV		1 Mar 1999
 *		Damn. Added missing check for ->dead in listen queues scanning.
 *
 *	Miklos Szeredi 25 Jun 2007
 *		Reimplement with a cycle collecting algorithm.
This should * solve several problems with the previous code, like being racy * wrt receive and holding up unrelated socket operations. */ #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/wait.h> #include <net/sock.h> #include <net/af_unix.h> #include <net/scm.h> #include <net/tcp_states.h> #include "scm.h" /* Internal data structures and random procedures: */ static LIST_HEAD(gc_candidates); static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { struct sk_buff *skb; struct sk_buff *next; spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { /* Do we have file descriptors ? */ if (UNIXCB(skb).fp) { bool hit = false; /* Process the descriptors of this socket */ int nfd = UNIXCB(skb).fp->count; struct file **fp = UNIXCB(skb).fp->fp; while (nfd--) { /* Get the socket the fd matches if it indeed does so */ struct sock *sk = unix_get_socket(*fp++); if (sk) { struct unix_sock *u = unix_sk(sk); /* Ignore non-candidates, they could * have been added to the queues after * starting the garbage collection */ if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { hit = true; func(u); } } } if (hit && hitlist != NULL) { __skb_unlink(skb, &x->sk_receive_queue); __skb_queue_tail(hitlist, skb); } } } spin_unlock(&x->sk_receive_queue.lock); } static void scan_children(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) { if (x->sk_state != TCP_LISTEN) { scan_inflight(x, func, hitlist); } else { struct sk_buff *skb; struct sk_buff *next; struct unix_sock *u; LIST_HEAD(embryos); /* For a listening socket collect the queued embryos * and perform a scan on them as well. */ spin_lock(&x->sk_receive_queue.lock); skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { u = unix_sk(skb->sk); /* An embryo cannot be in-flight, so it's safe * to use the list link. */ BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); while (!list_empty(&embryos)) { u = list_entry(embryos.next, struct unix_sock, link); scan_inflight(&u->sk, func, hitlist); list_del_init(&u->link); } } } static void dec_inflight(struct unix_sock *usk) { atomic_long_dec(&usk->inflight); } static void inc_inflight(struct unix_sock *usk) { atomic_long_inc(&usk->inflight); } static void inc_inflight_move_tail(struct unix_sock *u) { atomic_long_inc(&u->inflight); /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over */ if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) list_move_tail(&u->link, &gc_candidates); } static bool gc_in_progress; #define UNIX_INFLIGHT_TRIGGER_GC 16000 void wait_for_unix_gc(void) { /* If number of inflight sockets is insane, * force a garbage collect right now. * Paired with the WRITE_ONCE() in unix_inflight(), * unix_notinflight() and gc_in_progress(). 
*/ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && !READ_ONCE(gc_in_progress)) unix_gc(); wait_event(unix_gc_wait, gc_in_progress == false); } /* The external entry point: unix_gc() */ void unix_gc(void) { struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; struct list_head cursor; LIST_HEAD(not_cycle_list); spin_lock(&unix_gc_lock); /* Avoid a recursive GC. */ if (gc_in_progress) goto out; /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, true); /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. * * Holding unix_gc_lock will protect these candidates from * being detached, and hence from gaining an external * reference. Since there are no possible receivers, all * buffers currently on the candidates' queues stay there * during the garbage collection. * * We also know that no new candidate can be added onto the * receive queues. Other, non candidate sockets _can_ be * added to queue, so we must make sure only to touch * candidates. */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); inflight_refs = atomic_long_read(&u->inflight); BUG_ON(inflight_refs < 1); BUG_ON(total_refs < inflight_refs); if (total_refs == inflight_refs) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); } } /* Now remove all internal in-flight reference to children of * the candidates. */ list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, dec_inflight, NULL); /* Restore the references for children of all candidates, * which have remaining references. Do this recursively, so * only those remain, which form cyclic references. * * Use a "cursor" link, to make the list traversal safe, even * though elements might be moved about. */ list_add(&cursor, &gc_candidates); while (cursor.next != &gc_candidates) { u = list_entry(cursor.next, struct unix_sock, link); /* Move cursor to after the current position. */ list_move(&cursor, &u->link); if (atomic_long_read(&u->inflight) > 0) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); } } list_del(&cursor); /* Now gc_candidates contains only garbage. Restore original * inflight counters for these as well, and remove the skbuffs * which are creating the cycle(s). */ skb_queue_head_init(&hitlist); list_for_each_entry(u, &gc_candidates, link) scan_children(&u->sk, inc_inflight, &hitlist); /* not_cycle_list contains those sockets which do not make up a * cycle. Restore these to the inflight list. */ while (!list_empty(¬_cycle_list)) { u = list_entry(not_cycle_list.next, struct unix_sock, link); __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); list_move_tail(&u->link, &gc_inflight_list); } spin_unlock(&unix_gc_lock); /* We need io_uring to clean its registered files, ignore all io_uring * originated skbs. It's fine as io_uring doesn't keep references to * other io_uring instances and so killing all other files in the cycle * will put all io_uring references forcing it to go through normal * release.path eventually putting registered files. 
 */
	skb_queue_walk_safe(&hitlist, skb, next_skb) {
		if (skb->destructor == io_uring_destruct_scm) {
			__skb_unlink(skb, &hitlist);
			skb_queue_tail(&skb->sk->sk_receive_queue, skb);
		}
	}

	/* Here we are. Hitlist is filled. Die. */
	__skb_queue_purge(&hitlist);

	spin_lock(&unix_gc_lock);

	/* There could be io_uring registered files, just push them back to
	 * the inflight list
	 */
	list_for_each_entry_safe(u, next, &gc_candidates, link)
		list_move_tail(&u->link, &gc_inflight_list);

	/* All candidates should have been detached by now. */
	BUG_ON(!list_empty(&gc_candidates));

	/* Paired with READ_ONCE() in wait_for_unix_gc(). */
	WRITE_ONCE(gc_in_progress, false);

	wake_up(&unix_gc_wait);

 out:
	spin_unlock(&unix_gc_lock);
}
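/*
 * Editor's illustrative sketch (userspace, not kernel code): a minimal
 * reproducer of the reference cycle unix_gc() above exists to reclaim.
 * send_fd() is a hypothetical helper. After both close() calls, the only
 * reference to sv[0] is the SCM_RIGHTS message sitting in sv[0]'s own
 * receive queue, so only the garbage collector can free it.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

static int send_fd(int via, int fd)
{
	char byte = 0;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

	return sendmsg(via, &msg, 0) == 1 ? 0 : -1;
}

int main(void)
{
	int sv[2];

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv))
		return 1;

	/* Queue a message carrying sv[0]'s own fd on sv[0], sent from its
	 * peer sv[1], then drop both descriptors: the in-flight reference
	 * in sv[0]'s own queue is now the only one left.
	 */
	if (send_fd(sv[1], sv[0]))
		return 1;

	close(sv[0]);
	close(sv[1]);
	/* unix_gc() runs from the AF_UNIX release path and breaks the cycle. */
	return 0;
}
#endif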
// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) "CRED: " fmt

#include <linux/export.h>
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
#include <linux/uidgid.h>

#if 0
#define kdebug(FMT, ...)						\
	printk("[%-5.5s%5u] " FMT "\n",					\
	       current->comm, current->pid, ##__VA_ARGS__)
#else
#define kdebug(FMT, ...)						\
do {									\
	if (0)								\
		no_printk("[%-5.5s%5u] " FMT "\n",			\
			  current->comm, current->pid, ##__VA_ARGS__);	\
} while (0)
#endif

static struct kmem_cache *cred_jar;

/* init to 2 - one for init_task, one to ensure it is never freed */
static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };

/*
 * The initial credentials for the initial task
 */
struct cred init_cred = {
	.usage			= ATOMIC_LONG_INIT(4),
	.uid			= GLOBAL_ROOT_UID,
	.gid			= GLOBAL_ROOT_GID,
	.suid			= GLOBAL_ROOT_UID,
	.sgid			= GLOBAL_ROOT_GID,
	.euid			= GLOBAL_ROOT_UID,
	.egid			= GLOBAL_ROOT_GID,
	.fsuid			= GLOBAL_ROOT_UID,
	.fsgid			= GLOBAL_ROOT_GID,
	.securebits		= SECUREBITS_DEFAULT,
	.cap_inheritable	= CAP_EMPTY_SET,
	.cap_permitted		= CAP_FULL_SET,
	.cap_effective		= CAP_FULL_SET,
	.cap_bset		= CAP_FULL_SET,
	.user			= INIT_USER,
	.user_ns		= &init_user_ns,
	.group_info		= &init_groups,
	.ucounts		= &init_ucounts,
};

/*
 * The RCU callback to actually dispose of a set of credentials
 */
static void put_cred_rcu(struct rcu_head *rcu)
{
	struct cred *cred = container_of(rcu, struct cred, rcu);

	kdebug("put_cred_rcu(%p)", cred);

	if (atomic_long_read(&cred->usage) != 0)
		panic("CRED: put_cred_rcu() sees %p with usage %ld\n",
		      cred, atomic_long_read(&cred->usage));

	security_cred_free(cred);
	key_put(cred->session_keyring);
	key_put(cred->process_keyring);
	key_put(cred->thread_keyring);
	key_put(cred->request_key_auth);
	if (cred->group_info)
		put_group_info(cred->group_info);
	free_uid(cred->user);
	if (cred->ucounts)
		put_ucounts(cred->ucounts);
	put_user_ns(cred->user_ns);
	kmem_cache_free(cred_jar, cred);
}

/**
 * __put_cred - Destroy a set of credentials
 * @cred: The record to release
 *
 * Destroy a set of credentials on which no references remain.
 */
void __put_cred(struct cred *cred)
{
	kdebug("__put_cred(%p{%ld})", cred,
	       atomic_long_read(&cred->usage));

	BUG_ON(atomic_long_read(&cred->usage) != 0);
	BUG_ON(cred == current->cred);
	BUG_ON(cred == current->real_cred);

	if (cred->non_rcu)
		put_cred_rcu(&cred->rcu);
	else
		call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);

/*
 * Clean up a task's credentials when it exits
 */
void exit_creds(struct task_struct *tsk)
{
	struct cred *real_cred, *cred;

	kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred,
	       atomic_long_read(&tsk->cred->usage));

	real_cred = (struct cred *) tsk->real_cred;
	tsk->real_cred = NULL;

	cred = (struct cred *) tsk->cred;
	tsk->cred = NULL;

	if (real_cred == cred) {
		put_cred_many(cred, 2);
	} else {
		put_cred(real_cred);
		put_cred(cred);
	}

#ifdef CONFIG_KEYS_REQUEST_CACHE
	key_put(tsk->cached_requested_key);
	tsk->cached_requested_key = NULL;
#endif
}

/**
 * get_task_cred - Get another task's objective credentials
 * @task: The task to query
 *
 * Get the objective credentials of a task, pinning them so that they can't go
 * away.  Accessing a task's credentials directly is not permitted.
 *
 * The caller must also make sure task doesn't get deleted, either by holding a
 * ref on task or by holding tasklist_lock to prevent it from being unlinked.
 */
const struct cred *get_task_cred(struct task_struct *task)
{
	const struct cred *cred;

	rcu_read_lock();

	do {
		cred = __task_cred((task));
		BUG_ON(!cred);
	} while (!get_cred_rcu(cred));

	rcu_read_unlock();
	return cred;
}
EXPORT_SYMBOL(get_task_cred);

/*
 * Allocate blank credentials, such that the credentials can be filled in at a
 * later date without risk of ENOMEM.
*/ struct cred *cred_alloc_blank(void) { struct cred *new; new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); if (!new) return NULL; atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } /** * prepare_creds - Prepare a new set of credentials for modification * * Prepare a new set of task credentials for modification. A task's creds * shouldn't generally be modified directly, therefore this function is used to * prepare a new copy, which the caller then modifies and then commits by * calling commit_creds(). * * Preparation involves making a copy of the objective creds for modification. * * Returns a pointer to the new creds-to-be if successful, NULL otherwise. * * Call commit_creds() or abort_creds() to clean up. */ struct cred *prepare_creds(void) { struct task_struct *task = current; const struct cred *old; struct cred *new; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_creds() alloc %p", new); old = task->cred; memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->session_keyring); key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; return new; error: abort_creds(new); return NULL; } EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { struct cred *new; new = prepare_creds(); if (!new) return new; #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; /* inherit the session keyring; new process keyring */ key_put(new->process_keyring); new->process_keyring = NULL; #endif new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; return new; } /* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new * set. * * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { struct cred *new; int ret; #ifdef CONFIG_KEYS_REQUEST_CACHE p->cached_requested_key = NULL; #endif if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && #endif clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } new = prepare_creds(); if (!new) return -ENOMEM; if (clone_flags & CLONE_NEWUSER) { ret = create_user_ns(new); if (ret < 0) goto error_put; ret = set_cred_ucounts(new); if (ret < 0) goto error_put; } #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ if (new->thread_keyring) { key_put(new->thread_keyring); new->thread_keyring = NULL; if (clone_flags & CLONE_THREAD) install_thread_keyring_to_cred(new); } /* The process keyring is only shared between the threads in a process; * anything outside of those threads doesn't inherit. 
*/ if (!(clone_flags & CLONE_THREAD)) { key_put(new->process_keyring); new->process_keyring = NULL; } #endif p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; error_put: put_cred(new); return ret; } static bool cred_cap_issubset(const struct cred *set, const struct cred *subset) { const struct user_namespace *set_ns = set->user_ns; const struct user_namespace *subset_ns = subset->user_ns; /* If the two credentials are in the same user namespace see if * the capabilities of subset are a subset of set. */ if (set_ns == subset_ns) return cap_issubset(subset->cap_permitted, set->cap_permitted); /* The credentials are in different user namespaces, * therefore one is a subset of the other only if set is an * ancestor of subset and set->euid is the owner of subset or one * of subset's ancestors. */ for (; subset_ns != &init_user_ns; subset_ns = subset_ns->parent) { if ((set_ns == subset_ns->parent) && uid_eq(subset_ns->owner, set->euid)) return true; } return false; } /** * commit_creds - Install new credentials upon the current task * @new: The credentials to be assigned * * Install a new set of credentials to the current task, using RCU to replace * the old set. Both the objective and the subjective credentials pointers are * updated. This function may not be called if the subjective credentials are * in an overridden state. * * This function eats the caller's reference to the new credentials. * * Always returns 0 thus allowing this function to be tail-called at the end * of, say, sys_setgid(). */ int commit_creds(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; kdebug("commit_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(task->cred != old); BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ if (!uid_eq(old->euid, new->euid) || !gid_eq(old->egid, new->egid) || !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, * the dumpability change must become visible before * the credential change; otherwise, a __ptrace_may_access() * racing with this change may be able to attach to a task it * shouldn't be able to attach to (as if the task had dropped * privileges without becoming nondumpable). * Pairs with a read barrier in __ptrace_may_access(). */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked * in set_user().
*/ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); /* send notifications */ if (!uid_eq(new->uid, old->uid) || !uid_eq(new->euid, old->euid) || !uid_eq(new->suid, old->suid) || !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); if (!gid_eq(new->gid, old->gid) || !gid_eq(new->egid, old->egid) || !gid_eq(new->sgid, old->sgid) || !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ put_cred_many(old, 2); return 0; } EXPORT_SYMBOL(commit_creds); /** * abort_creds - Discard a set of credentials and unlock the current task * @new: The credentials that were going to be applied * * Discard a set of credentials that were under construction and unlock the * current task. */ void abort_creds(struct cred *new) { kdebug("abort_creds(%p{%ld})", new, atomic_long_read(&new->usage)); BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); /** * override_creds - Override the current process's subjective credentials * @new: The credentials to be assigned * * Install a set of temporary override subjective credentials on the current * process, returning the old set for later reversion. */ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; kdebug("override_creds(%p{%ld})", new, atomic_long_read(&new->usage)); /* * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. * * That means that we do not clear the 'non_rcu' flag, since * we are only installing the cred into the thread-synchronous * '->cred' pointer, not the '->real_cred' pointer that is * visible to other threads under RCU. */ get_new_cred((struct cred *)new); rcu_assign_pointer(current->cred, new); kdebug("override_creds() = %p{%ld}", old, atomic_long_read(&old->usage)); return old; } EXPORT_SYMBOL(override_creds); /** * revert_creds - Revert a temporary subjective credentials override * @old: The credentials to be restored * * Revert a temporary set of override subjective credentials to an old set, * discarding the override set. */ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; kdebug("revert_creds(%p{%ld})", old, atomic_long_read(&old->usage)); rcu_assign_pointer(current->cred, old); put_cred(override); } EXPORT_SYMBOL(revert_creds); /** * cred_fscmp - Compare two credentials with respect to filesystem access. * @a: The first credential * @b: The second credential * * cred_fscmp() will return zero if both credentials have the same * fsuid, fsgid, and supplementary groups. That is, if they will both * provide the same access to files based on mode/uid/gid. * If the credentials are different, then either -1 or 1 will * be returned depending on whether @a comes before or after @b * respectively in an arbitrary, but stable, ordering of credentials.
* * Return: -1, 0, or 1 depending on comparison */ int cred_fscmp(const struct cred *a, const struct cred *b) { struct group_info *ga, *gb; int g; if (a == b) return 0; if (uid_lt(a->fsuid, b->fsuid)) return -1; if (uid_gt(a->fsuid, b->fsuid)) return 1; if (gid_lt(a->fsgid, b->fsgid)) return -1; if (gid_gt(a->fsgid, b->fsgid)) return 1; ga = a->group_info; gb = b->group_info; if (ga == gb) return 0; if (ga == NULL) return -1; if (gb == NULL) return 1; if (ga->ngroups < gb->ngroups) return -1; if (ga->ngroups > gb->ngroups) return 1; for (g = 0; g < ga->ngroups; g++) { if (gid_lt(ga->gid[g], gb->gid[g])) return -1; if (gid_gt(ga->gid[g], gb->gid[g])) return 1; } return 0; } EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { struct ucounts *new_ucounts, *old_ucounts = new->ucounts; /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. */ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; put_ucounts(old_ucounts); return 0; } /* * initialise the credentials stuff */ void __init cred_init(void) { /* allocate a slab in which we can store credentials */ cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** * prepare_kernel_cred - Prepare a set of credentials for a kernel service * @daemon: A userspace daemon to be used as a reference * * Prepare a set of credentials for a kernel service. This can then be used to * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * * @daemon is used to provide a base cred, with the security data derived from * that; if this is "&init_task", they'll be set to 0, no groups, full * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { const struct cred *old; struct cred *new; if (WARN_ON_ONCE(!daemon)) return NULL; new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); *new = *old; new->non_rcu = 0; atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS new->session_keyring = NULL; new->process_keyring = NULL; new->thread_keyring = NULL; new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif #ifdef CONFIG_SECURITY new->security = NULL; #endif new->ucounts = get_ucounts(new->ucounts); if (!new->ucounts) goto error; if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; put_cred(old); return new; error: put_cred(new); put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); /** * set_security_override - Set the security ID in a set of credentials * @new: The credentials to alter * @secid: The LSM security ID to set * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. 
*/ int set_security_override(struct cred *new, u32 secid) { return security_kernel_act_as(new, secid); } EXPORT_SYMBOL(set_security_override); /** * set_security_override_from_ctx - Set the security ID in a set of credentials * @new: The credentials to alter * @secctx: The LSM security context to generate the security ID from. * * Set the LSM security ID in a set of credentials so that the subjective * security is overridden when an alternative set of credentials is used. The * security ID is specified in string form as a security context to be * interpreted by the LSM. */ int set_security_override_from_ctx(struct cred *new, const char *secctx) { u32 secid; int ret; ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); if (ret < 0) return ret; return set_security_override(new, secid); } EXPORT_SYMBOL(set_security_override_from_ctx); /** * set_create_files_as - Set the LSM file create context in a set of credentials * @new: The credentials to alter * @inode: The inode to take the context from * * Change the LSM file creation context in a set of credentials to be the same * as the object context of the specified inode, so that the new inodes have * the same MAC context as that inode. */ int set_create_files_as(struct cred *new, struct inode *inode) { if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EINVAL; new->fsuid = inode->i_uid; new->fsgid = inode->i_gid; return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as);
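/*
 * Usage sketch (illustrative, not part of kernel/cred.c): the intended
 * pattern for the API above.  A task never edits its credentials in place;
 * it copies them with prepare_creds(), modifies the copy, then installs it
 * with commit_creds(), which consumes the reference.  Temporary subjective
 * overrides use override_creds()/revert_creds().  Both example_* functions
 * below are hypothetical.
 */
static int example_set_fsuid(kuid_t fsuid)
{
	struct cred *new;

	new = prepare_creds();		/* private copy of current->cred */
	if (!new)
		return -ENOMEM;

	new->fsuid = fsuid;		/* modify the copy, not current->cred */
	return commit_creds(new);	/* install it; eats our reference */
}

static long example_act_as(const struct cred *service_cred)
{
	const struct cred *old;
	long ret;

	old = override_creds(service_cred);	/* subjective creds only */
	ret = 0;	/* ... do work with service_cred as current->cred ... */
	revert_creds(old);			/* restore and drop override */
	return ret;
}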
/* SPDX-License-Identifier: GPL-2.0 */ /* * fs-verity: read-only file-based authenticity protection * * This header declares the interface between the fs/verity/ support layer and * filesystems that support fs-verity. * * Copyright 2019 Google LLC */ #ifndef _LINUX_FSVERITY_H #define _LINUX_FSVERITY_H #include <linux/fs.h> #include <linux/mm.h> #include <crypto/hash_info.h> #include <crypto/sha2.h> #include <uapi/linux/fsverity.h> /* * Largest digest size among all hash algorithms supported by fs-verity. * Currently assumed to be <= size of fsverity_descriptor::root_hash. */ #define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 /* Verity operations for filesystems */ struct fsverity_operations { /** * Begin enabling verity on the given file. * * @filp: a readonly file descriptor for the file * * The filesystem must do any needed filesystem-specific preparations * for enabling verity, e.g. evicting inline data. It must also return * -EBUSY if verity is already being enabled on the given file. * * i_rwsem is held for write. * * Return: 0 on success, -errno on failure */ int (*begin_enable_verity)(struct file *filp); /** * End enabling verity on the given file. * * @filp: a readonly file descriptor for the file * @desc: the verity descriptor to write, or NULL on failure * @desc_size: size of verity descriptor, or 0 on failure * @merkle_tree_size: total bytes the Merkle tree took up * * If desc == NULL, then enabling verity failed and the filesystem need * only do any necessary cleanups. Else, it must also store the given * verity descriptor to a fs-specific location associated with the inode * and do any fs-specific actions needed to mark the inode as a verity * inode, e.g. setting a bit in the on-disk inode. The filesystem is * also responsible for setting the S_VERITY flag in the VFS inode. * * i_rwsem is held for write, but it may have been dropped between * ->begin_enable_verity() and ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*end_enable_verity)(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size); /** * Get the verity descriptor of the given inode.
* * @inode: an inode with the S_VERITY flag set * @buf: buffer in which to place the verity descriptor * @bufsize: size of @buf, or 0 to retrieve the size only * * If bufsize == 0, then the size of the verity descriptor is returned. * Otherwise the verity descriptor is written to 'buf' and its actual * size is returned; -ERANGE is returned if it's too large. This may be * called by multiple processes concurrently on the same inode. * * Return: the size on success, -errno on failure */ int (*get_verity_descriptor)(struct inode *inode, void *buf, size_t bufsize); /** * Read a Merkle tree page of the given inode. * * @inode: the inode * @index: 0-based index of the page within the Merkle tree * @num_ra_pages: The number of Merkle tree pages that should be * prefetched starting at @index if the page at @index * isn't already cached. Implementations may ignore this * argument; it's only a performance optimization. * * This can be called at any time on an open verity file. It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. * * Return: the page on success, ERR_PTR() on failure */ struct page *(*read_merkle_tree_page)(struct inode *inode, pgoff_t index, unsigned long num_ra_pages); /** * Write a Merkle tree block to the given inode. * * @inode: the inode for which the Merkle tree is being built * @buf: the Merkle tree block to write * @pos: the position of the block in the Merkle tree (in bytes) * @size: the Merkle tree block size (in bytes) * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, u64 pos, unsigned int size); }; #ifdef CONFIG_FS_VERITY static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fsverity_set_info(). * I.e., another task may publish ->i_verity_info concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_verity_info); } /* enable.c */ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg); /* open.c */ int __fsverity_file_open(struct inode *inode, struct file *filp); int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void __fsverity_cleanup_inode(struct inode *inode); /** * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. 
*/ static inline void fsverity_cleanup_inode(struct inode *inode) { if (inode->i_verity_info) __fsverity_cleanup_inode(inode); } /* read_metadata.c */ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); /* verify.c */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); #else /* !CONFIG_FS_VERITY */ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { return NULL; } /* enable.c */ static inline int fsverity_ioctl_enable(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } /* measure.c */ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg) { /* * fsverity is not enabled in the kernel configuration, so always report * that the file doesn't have fsverity enabled (digest size 0). */ return 0; } /* open.c */ static inline int __fsverity_file_open(struct inode *inode, struct file *filp) { return -EOPNOTSUPP; } static inline int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline void fsverity_cleanup_inode(struct inode *inode) { } /* read_metadata.c */ static inline int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) { return -EOPNOTSUPP; } /* verify.c */ static inline bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(1); return false; } static inline void fsverity_verify_bio(struct bio *bio) { WARN_ON_ONCE(1); } static inline void fsverity_enqueue_verify_work(struct work_struct *work) { WARN_ON_ONCE(1); } #endif /* !CONFIG_FS_VERITY */ static inline bool fsverity_verify_folio(struct folio *folio) { return fsverity_verify_blocks(folio, folio_size(folio), 0); } static inline bool fsverity_verify_page(struct page *page) { return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0); } /** * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * * This checks whether ->i_verity_info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) * * Return: true if reads need to go through fs-verity, otherwise false */ static inline bool fsverity_active(const struct inode *inode) { return fsverity_get_info(inode) != NULL; } /** * fsverity_file_open() - prepare to open a verity file * @inode: the inode being opened * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, * set up the inode's ->i_verity_info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. 
* * Return: 0 on success, -errno on failure */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) { if (IS_VERITY(inode)) return __fsverity_file_open(inode, filp); return 0; } /** * fsverity_prepare_setattr() - prepare to change a verity inode's attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Verity files are immutable, so deny truncates. This isn't covered by the * open-time check because sys_truncate() takes a path, not a file descriptor. * * Return: 0 on success, -errno on failure */ static inline int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_VERITY(d_inode(dentry))) return __fsverity_prepare_setattr(dentry, attr); return 0; } #endif /* _LINUX_FSVERITY_H */
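/*
 * Usage sketch (illustrative, not part of this header): how a filesystem
 * would typically hook fs-verity in, per the comments above.  "examplefs"
 * and both function names are hypothetical.
 */
static int examplefs_open(struct inode *inode, struct file *filp)
{
	int err;

	/* Denies writes to verity files and sets up ->i_verity_info. */
	err = fsverity_file_open(inode, filp);
	if (err)
		return err;

	return generic_file_open(inode, filp);
}

static bool examplefs_need_verification(const struct inode *inode)
{
	/*
	 * Per the fsverity_active() comment: don't test IS_VERITY() here,
	 * since S_VERITY is set before ->i_verity_info while
	 * FS_IOC_ENABLE_VERITY is still completing.
	 */
	return fsverity_active(inode);
}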
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Checksumming functions for IP, TCP, UDP and so on * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Borrows very liberally from tcp.c and ip.c, see those * files for more names. */ #ifndef _CHECKSUM_H #define _CHECKSUM_H #include <linux/errno.h> #include <asm/types.h> #include <asm/byteorder.h> #include <asm/checksum.h> #if !defined(_HAVE_ARCH_COPY_AND_CSUM_FROM_USER) || !defined(HAVE_CSUM_COPY_USER) #include <linux/uaccess.h> #endif #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { if (copy_from_user(dst, src, len)) return 0; return csum_partial(dst, len, ~0U); } #endif #ifndef HAVE_CSUM_COPY_USER static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); if (copy_to_user(dst, src, len) == 0) return sum; return 0; } #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); return csum_partial(dst, len, 0); } #endif #ifndef HAVE_ARCH_CSUM_ADD static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; return (__force __wsum)(res + (res < (__force u32)addend)); } #endif static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; res += (__force u16)addend; return (__force __sum16)(res + (res < (__force u16)addend)); } static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } #ifndef HAVE_ARCH_CSUM_SHIFT static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) return (__force __wsum)ror32((__force u32)sum, 8); return sum; } #endif static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } static __always_inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return
csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); *sum = csum_fold(csum_add(tmp, (__force __wsum)to)); } /* Implements RFC 1624 (Incremental Internet Checksum) * 3. Discussion states : * HC' = ~(~HC + ~m + m') * m : old value of a 16bit field * m' : new value of a 16bit field */ static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) { *csum = csum_add(csum_sub(*csum, old), new); } struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; /* Subtract out checksum up to start */ csum = csum_sub(csum, csum_partial(ptr, start, 0)); /* Set derived checksum in packet */ delta = csum_sub((__force __wsum)csum_fold(csum), (__force __wsum)*psum); *psum = csum_fold(csum); return delta; } static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } static __always_inline __wsum wsum_negate(__wsum val) { return (__force __wsum)-((__force u32)val); } #endif
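/*
 * Usage sketch (illustrative, not part of this header): applying the
 * RFC 1624 helper above to patch a checksum incrementally when a single
 * 16-bit header field changes, instead of re-summing the whole packet.
 * Assumes struct iphdr from <linux/ip.h>; example_rewrite_ipid() is
 * hypothetical.
 */
#include <linux/ip.h>

static inline void example_rewrite_ipid(struct iphdr *iph, __be16 new_id)
{
	/* HC' = ~(~HC + ~m + m'), with m = iph->id and m' = new_id */
	csum_replace2(&iph->check, iph->id, new_id);
	iph->id = new_id;
}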
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; /* * Used to safely traverse idev->addr_list in process context * if the idev->lock needed to protect idev->addr_list cannot be held. * In that case, add the items to this list temporarily and iterate * without holding idev->lock. * See addrconf_ifdown and dev_forward_change. */ struct list_head if_list_aux; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; u8 ifa_proto; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; struct in6_addr sl_addr[] __counted_by(sl_max); }; #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans.
left to send */ struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; netdevice_tracker dev_tracker; struct list_head addr_list; struct ifmcaddr6 __rcu *mc_list; struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ struct delayed_work mc_query_work; /* mld query work */ struct delayed_work mc_report_work; /* mld report work */ struct sk_buff_head mc_query_queue; /* mld query queue */ struct sk_buff_head mc_report_queue; /* mld report queue */ spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; unsigned int ra_mtu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x60; /* 
IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? */ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif
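/*
 * Usage sketch (illustrative, not part of this header): mapping the
 * all-nodes multicast group ff02::1 to its Ethernet address with
 * ipv6_eth_mc_map() above.  The result is 33:33 followed by the low
 * 32 bits of the group address, i.e. 33:33:00:00:00:01.
 * example_all_nodes_mac() is hypothetical.
 */
static inline void example_all_nodes_mac(char mac[6])
{
	struct in6_addr grp = IN6ADDR_LINKLOCAL_ALLNODES_INIT;	/* ff02::1 */

	ipv6_eth_mc_map(&grp, mac);	/* mac = 33:33:00:00:00:01 */
}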
// SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/export.h> #include <linux/init.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/checksum.h> #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <linux/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/nfnetlink_conntrack.h> static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype); static void __udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, struct udphdr *hdr, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, bool do_csum) { __be16 *portptr, newport; if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } if (do_csum) { nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } *portptr = newport; } static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); return true; } static bool udplite_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff);
__udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true); #endif return true; } static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct sctphdr *hdr; int hdrsize = 8; /* This could be an inner header returned in icmp packet; in such * cases we cannot update the checksum field since it is outside * of the 8 bytes of transport layer headers we are guaranteed. */ if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ hdr->source = tuple->src.u.sctp.port; } else { /* Get rid of dst port */ hdr->dest = tuple->dst.u.sctp.port; } if (hdrsize < sizeof(*hdr)) return true; if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; } #endif return true; } static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct tcphdr *hdr; __be16 *portptr, newport, oldport; int hdrsize = 8; /* TCP connection tracking guarantees this much */ /* this could be an inner header returned in icmp packet; in such cases we cannot update the checksum field since it is outside of the 8 bytes of transport layer headers we are guaranteed */ if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } static bool dccp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_DCCP struct dccp_hdr *hdr; __be16 *portptr, oldport, newport; int hdrsize = 8; /* DCCP connection tracking guarantees this much */ if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { newport = tuple->src.u.dccp.port; portptr = &hdr->dccph_sport; } else { newport = tuple->dst.u.dccp.port; portptr = &hdr->dccph_dport; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, false); #endif return true; } static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); switch (hdr->type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMP_INFO_REQUEST: case ICMP_INFO_REPLY:
case ICMP_ADDRESS: case ICMP_ADDRESSREPLY: break; default: return true; } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } static bool icmpv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmp6hdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype); if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; } /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) const struct gre_base_hdr *greh; struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->flags & GRE_VERSION) { case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; default: pr_debug("can't nat unknown GRE version\n"); return false; } #endif return true; } static bool l4proto_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { switch (tuple->dst.protonum) { case IPPROTO_TCP: return tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDPLITE: return udplite_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMP: return icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_DCCP: return dccp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); } /* If we don't know protocol -- no error, pass it unmodified. 
*/ return true; } static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { struct iphdr *iph; unsigned int hdroff; if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return true; } static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ipv6h; __be16 frag_off; int hdroff; u8 nexthdr; if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; nexthdr = ipv6h->nexthdr; hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr, &frag_off); if (hdroff < 0) goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; /* must reload, offset might have changed */ ipv6h = (void *)skb->data + iphdroff; manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; else ipv6h->daddr = target->dst.u3.in6; #endif return true; } unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir) { struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (target.src.l3num) { case NFPROTO_IPV6: if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; case NFPROTO_IPV4: if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; default: WARN_ON_ONCE(1); break; } return NF_DROP; } static void nf_nat_ipv4_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); __be32 oldip, newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = iph->saddr; newip = t->src.u3.ip; } else { oldip = iph->daddr; newip = t->dst.u3.ip; } inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv6_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); const struct in6_addr *oldip, *newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = &ipv6h->saddr; newip = &t->src.u3.in6; } else { oldip = &ipv6h->daddr; newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, newip->s6_addr32, true); #endif } static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { switch (t->src.l3num) { case NFPROTO_IPV4: nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype); return; case NFPROTO_IPV6: nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype); return; } } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int 
oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct iphdr *iph = ip_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + ip_hdrlen(skb); skb->csum_offset = (void *)check - data; *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #if IS_ENABLED(CONFIG_IPV6) static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + (data - (void *)skb->data); skb->csum_offset = (void *)check - data; *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #endif void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { switch (nfproto) { case NFPROTO_IPV4: nf_nat_ipv4_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: nf_nat_ipv6_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #endif } WARN_ON_ONCE(1); } int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv4_pre_routing(void *priv, struct sk_buff 
*skb, const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } #ifdef CONFIG_XFRM static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct sock *sk = skb->sk; struct dst_entry *dst; unsigned int hh_len; struct flowi fl; int err; err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; if (!dst_hold_safe(dst)) return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } #endif static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) { enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: case IPPROTO_UDP: break; default: return false; } dir = CTINFO2DIR(ctinfo); if (dir != IP_CT_DIR_ORIGINAL) return false; return ct->tuplehash[!dir].tuple.dst.u.all != sport; } static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { __be32 saddr = ip_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* skb has a socket assigned via tcp edemux. We need to check * if nf_nat_ipv4_fn() has mangled the packet in a way that * edemux would not have found this socket. * * This includes both changes to the source address and changes * to the source port, which are both handled by the * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux * might have found a socket for the old (pre-snat) address. 
*/ if (saddr != ip_hdr(skb)->saddr || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; } static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_out, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); #if IS_ENABLED(CONFIG_IPV6) int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen) { struct { struct icmp6hdr icmp6; struct ipv6hdr ip6; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; inside = (void 
*)skb->data + hdrlen; if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); inside = (void *)skb->data + hdrlen; inside->icmp6.icmp6_cksum = 0; inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be16 frag_off; int hdrlen; u8 nexthdr; ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, state->hook, hdrlen)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr = ipv6_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* see nf_nat_ipv4_local_in */ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); return ret; } static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static 
unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_out, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_local_fn, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, }, }; int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ #if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) { int ret; if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) return -EINVAL; ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); if (ret) return ret; ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); if (ret) nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); return ret; } EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); #endif /* NFT INET NAT */ |
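The registration helpers above are consumed by NAT backends such as the nftables NAT chain type. Below is a minimal, hypothetical sketch of that pattern; the names (my_nat_do_chain, my_nat_ops, the pernet callbacks) are invented for illustration and are not the in-tree consumer. The ops passed in carry the pf/hooknum/priority under which the shared hook set is installed on first use; the NAT core is then expected to call this hook when it needs a binding set up for a new connection.

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_nat.h>

/* Hypothetical chain hook: a real backend would evaluate its ruleset
 * here and set up NAT on the connection; plain NF_ACCEPT leaves the
 * NAT core to fall back to a null binding. */
static unsigned int my_nat_do_chain(void *priv, struct sk_buff *skb,
				    const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static const struct nf_hook_ops my_nat_ops = {
	.hook     = my_nat_do_chain,
	.pf       = NFPROTO_IPV4,
	.hooknum  = NF_INET_PRE_ROUTING,
	.priority = NF_IP_PRI_NAT_DST,
};

/* Registration is per network namespace, typically from pernet ops. */
static int __net_init my_nat_net_init(struct net *net)
{
	return nf_nat_ipv4_register_fn(net, &my_nat_ops);
}

static void __net_exit my_nat_net_exit(struct net *net)
{
	nf_nat_ipv4_unregister_fn(net, &my_nat_ops);
}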
// SPDX-License-Identifier: GPL-2.0-or-later /* * Virtual NCI device simulation driver * * Copyright (C) 2020 Samsung Electronics * Bongsu Jeon <bongsu.jeon@samsung.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/wait.h> #include <net/nfc/nci_core.h> #define IOCTL_GET_NCIDEV_IDX 0 #define VIRTUAL_NFC_PROTOCOLS (NFC_PROTO_JEWEL_MASK | \ NFC_PROTO_MIFARE_MASK | \ NFC_PROTO_FELICA_MASK | \ NFC_PROTO_ISO14443_MASK | \ NFC_PROTO_ISO14443_B_MASK | \ NFC_PROTO_ISO15693_MASK) struct virtual_nci_dev { struct nci_dev *ndev; struct mutex mtx; struct sk_buff *send_buff; struct wait_queue_head wq; bool running; }; static int virtual_nci_open(struct nci_dev *ndev) { struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); vdev->running = true; return 0; } static int virtual_nci_close(struct nci_dev *ndev) { struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); mutex_lock(&vdev->mtx); kfree_skb(vdev->send_buff); vdev->send_buff = NULL; vdev->running = false; mutex_unlock(&vdev->mtx); return 0; } static int virtual_nci_send(struct nci_dev *ndev, struct sk_buff *skb) { struct virtual_nci_dev *vdev = nci_get_drvdata(ndev); mutex_lock(&vdev->mtx); if (vdev->send_buff || !vdev->running) { mutex_unlock(&vdev->mtx); kfree_skb(skb); return -1; } vdev->send_buff = skb_copy(skb, GFP_KERNEL); if (!vdev->send_buff) { mutex_unlock(&vdev->mtx); kfree_skb(skb); return -1; } mutex_unlock(&vdev->mtx); wake_up_interruptible(&vdev->wq); consume_skb(skb); return 0; } static const struct nci_ops virtual_nci_ops = { .open = virtual_nci_open, .close = virtual_nci_close, .send = virtual_nci_send }; static ssize_t virtual_ncidev_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct virtual_nci_dev *vdev = file->private_data; size_t actual_len; mutex_lock(&vdev->mtx); while (!vdev->send_buff) { mutex_unlock(&vdev->mtx); if (wait_event_interruptible(vdev->wq, vdev->send_buff)) return -EFAULT; mutex_lock(&vdev->mtx); } actual_len = min_t(size_t, count, vdev->send_buff->len); if (copy_to_user(buf, vdev->send_buff->data, actual_len)) { mutex_unlock(&vdev->mtx); return -EFAULT; } skb_pull(vdev->send_buff, actual_len); if (vdev->send_buff->len == 0) { consume_skb(vdev->send_buff); vdev->send_buff = NULL; } mutex_unlock(&vdev->mtx); return actual_len; } static ssize_t virtual_ncidev_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct virtual_nci_dev *vdev = file->private_data; struct sk_buff *skb; skb = alloc_skb(count, GFP_KERNEL); if (!skb) return -ENOMEM; if (copy_from_user(skb_put(skb, count), buf, count)) { kfree_skb(skb); return -EFAULT; } nci_recv_frame(vdev->ndev, skb); return count; } static int virtual_ncidev_open(struct inode *inode, struct
file *file) { int ret = 0; struct virtual_nci_dev *vdev; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) return -ENOMEM; vdev->ndev = nci_allocate_device(&virtual_nci_ops, VIRTUAL_NFC_PROTOCOLS, 0, 0); if (!vdev->ndev) { kfree(vdev); return -ENOMEM; } mutex_init(&vdev->mtx); init_waitqueue_head(&vdev->wq); file->private_data = vdev; nci_set_drvdata(vdev->ndev, vdev); ret = nci_register_device(vdev->ndev); if (ret < 0) { nci_free_device(vdev->ndev); mutex_destroy(&vdev->mtx); kfree(vdev); return ret; } return 0; } static int virtual_ncidev_close(struct inode *inode, struct file *file) { struct virtual_nci_dev *vdev = file->private_data; nci_unregister_device(vdev->ndev); nci_free_device(vdev->ndev); mutex_destroy(&vdev->mtx); kfree(vdev); return 0; } static long virtual_ncidev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct virtual_nci_dev *vdev = file->private_data; const struct nfc_dev *nfc_dev = vdev->ndev->nfc_dev; void __user *p = (void __user *)arg; if (cmd != IOCTL_GET_NCIDEV_IDX) return -ENOTTY; if (copy_to_user(p, &nfc_dev->idx, sizeof(nfc_dev->idx))) return -EFAULT; return 0; } static const struct file_operations virtual_ncidev_fops = { .owner = THIS_MODULE, .read = virtual_ncidev_read, .write = virtual_ncidev_write, .open = virtual_ncidev_open, .release = virtual_ncidev_close, .unlocked_ioctl = virtual_ncidev_ioctl }; static struct miscdevice miscdev = { .minor = MISC_DYNAMIC_MINOR, .name = "virtual_nci", .fops = &virtual_ncidev_fops, .mode = 0600, }; module_misc_device(miscdev); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Virtual NCI device simulation driver"); MODULE_AUTHOR("Bongsu Jeon <bongsu.jeon@samsung.com>"); |
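Since the whole point of this driver is to let user space emulate an NFC controller, a short user-space sketch may help. It assumes the misc node appears as /dev/virtual_nci (per the miscdevice name above) and trims error handling; IOCTL_GET_NCIDEV_IDX is the cmd value 0 defined above, and everything else is ordinary read()/write() of raw NCI frames.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define IOCTL_GET_NCIDEV_IDX 0

int main(void)
{
	unsigned char frame[260];	/* generously sized for NCI control frames */
	unsigned int idx;
	ssize_t len;
	int fd;

	fd = open("/dev/virtual_nci", O_RDWR);
	if (fd < 0)
		return 1;

	/* Fetch the nfc device index so user space can find the
	 * matching nfc%u entry under /sys/class/nfc. */
	if (ioctl(fd, IOCTL_GET_NCIDEV_IDX, &idx) < 0)
		return 1;
	printf("virtual NCI device index: %u\n", idx);

	/* read() blocks in virtual_ncidev_read() until the NCI core
	 * queues a command; an emulator would parse it and write()
	 * back a well-formed NCI response frame. */
	len = read(fd, frame, sizeof(frame));
	if (len > 0)
		printf("received %zd-byte NCI frame\n", len);

	close(fd);
	return 0;
}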
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2003 * Copyright (c) Cisco 1999,2000 * Copyright (c) Motorola 1999,2000,2001 * Copyright (c) La Monte H.P. Yarroll 2001 * * This file is part of the SCTP kernel implementation. * * A collection class to handle the storage of transport addresses. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. */ /* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses * in 'src' which have a broader scope than 'scope'.
*/ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; /* Extract the addresses which are relevant for this scope. */ list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, scope, gfp, flags); if (error < 0) goto out; } /* If there are no addresses matching the scope and * this is global scope, try to get a link scope address, with * the assumption that we must be sitting behind a NAT. */ if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, SCTP_SCOPE_LINK, gfp, flags); if (error < 0) goto out; } } /* If somehow no addresses were found that can be used with this * scope, it's an error. */ if (list_empty(&dest->address_list)) error = -ENETUNREACH; out: if (error) sctp_bind_addr_clean(dest); return error; } /* Exactly duplicate the address lists. This is necessary when doing * peer-offs and accepts. We don't want to put all the current system * addresses into the endpoint. That's useless. But we do want duplicat * the list of bound addresses that the older endpoint used. */ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; list_for_each_entry(addr, &src->address_list, list) { error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a), 1, gfp); if (error < 0) break; } return error; } /* Initialize the SCTP_bind_addr structure for either an endpoint or * an association. */ void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) { INIT_LIST_HEAD(&bp->address_list); bp->port = port; } /* Dispose of the address list. */ static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) { struct sctp_sockaddr_entry *addr, *temp; /* Empty the bind address list. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { list_del_rcu(&addr->list); kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); } } /* Dispose of an SCTP_bind_addr structure */ void sctp_bind_addr_free(struct sctp_bind_addr *bp) { /* Empty the bind address list. */ sctp_bind_addr_clean(bp); } /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, int new_size, __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; /* Add the address to the bind address list. */ addr = kzalloc(sizeof(*addr), gfp); if (!addr) return -ENOMEM; memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size)); /* Fix up the port if it has not yet been set. * Both v4 and v6 have the port at the same offset. */ if (!addr->a.v4.sin_port) addr->a.v4.sin_port = htons(bp->port); addr->state = addr_state; addr->valid = 1; INIT_LIST_HEAD(&addr->list); /* We always hold a socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_add_tail_rcu(&addr->list, &bp->address_list); SCTP_DBG_OBJCNT_INC(addr); return 0; } /* Delete an address from the bind address list in the SCTP_bind_addr * structure. 
*/ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) { struct sctp_sockaddr_entry *addr, *temp; int found = 0; /* We hold the socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { if (sctp_cmp_addr_exact(&addr->a, del_addr)) { /* Found the exact match. */ found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } if (found) { kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); return 0; } return -EINVAL; } /* Create a network byte-order representation of all the addresses * formated as SCTP parameters. * * The second argument is the return value for the length. */ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp) { union sctp_params addrparms; union sctp_params retval; int addrparms_len; union sctp_addr_param rawaddr; int len; struct sctp_sockaddr_entry *addr; struct list_head *pos; struct sctp_af *af; addrparms_len = 0; len = 0; /* Allocate enough memory at once. */ list_for_each(pos, &bp->address_list) { len += sizeof(union sctp_addr_param); } /* Don't even bother embedding an address if there * is only one. */ if (len == sizeof(union sctp_addr_param)) { retval.v = NULL; goto end_raw; } retval.v = kmalloc(len, gfp); if (!retval.v) goto end_raw; addrparms = retval; list_for_each_entry(addr, &bp->address_list, list) { af = sctp_get_af_specific(addr->a.v4.sin_family); len = af->to_addr_param(&addr->a, &rawaddr); memcpy(addrparms.v, &rawaddr, len); addrparms.v += len; addrparms_len += len; } end_raw: *addrs_len = addrparms_len; return retval; } /* * Create an address list out of the raw address list format (IPv4 and IPv6 * address parameters). */ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, int addrs_len, __u16 port, gfp_t gfp) { union sctp_addr_param *rawaddr; struct sctp_paramhdr *param; union sctp_addr addr; int retval = 0; int len; struct sctp_af *af; /* Convert the raw address to standard address format */ while (addrs_len) { param = (struct sctp_paramhdr *)raw_addr_list; rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); if (unlikely(!af) || !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; goto out_err; } if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); if (retval) /* Can't finish building the list, clean up. */ goto out_err; next: len = ntohs(param->length); addrs_len -= len; raw_addr_list += len; } return retval; out_err: if (retval) sctp_bind_addr_clean(bp); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Does this contain a specified address? Allow wildcarding. 
*/ int sctp_bind_addr_match(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; int match = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) { match = 1; break; } } rcu_read_unlock(); return match; } int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2) { struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; struct sctp_sockaddr_entry *laddr, *laddr2; bool exist = false; int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && laddr->valid && laddr2->valid) { exist = true; goto next; } } cnt = 0; break; next: cnt++; } rcu_read_unlock(); return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); } /* Does the address 'addr' conflict with any addresses in * the bp. */ int sctp_bind_addr_conflict(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *bp_sp, struct sctp_sock *addr_sp) { struct sctp_sockaddr_entry *laddr; int conflict = 0; struct sctp_sock *sp; /* Pick the IPv6 socket as the basis of comparison * since it's usually a superset of the IPv4. * If there is no IPv6 socket, then default to bind_addr. */ if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6) sp = bp_sp; else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6) sp = addr_sp; else sp = bp_sp; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; conflict = sp->pf->cmp_addr(&laddr->a, addr, sp); if (conflict) break; } rcu_read_unlock(); return conflict; } /* Get the state of the entry in the bind_addr_list */ int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr) { struct sctp_sockaddr_entry *laddr; struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (unlikely(!af)) return -1; list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (af->cmp_addr(&laddr->a, addr)) return laddr->state; } return -1; } /* Find the first address in the bind address list that is not present in * the addrs packed array. */ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; void *addr_buf; struct sctp_af *af; int i; /* This is only called sctp_send_asconf_del_ip() and we hold * the socket lock in that code patch, so that address list * can't change. */ list_for_each_entry(laddr, &bp->address_list, list) { addr_buf = (union sctp_addr *)addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) break; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) break; addr_buf += af->sockaddr_len; } if (i == addrcnt) return &laddr->a; } return NULL; } /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags) { int error = 0; if (sctp_is_any(NULL, addr)) { error = sctp_copy_local_addr_list(net, dest, scope, gfp, flags); } else if (sctp_in_scope(net, addr, scope)) { /* Now that the address is in scope, check to see if * the address type is supported by local sock as * well as the remote peer. 
*/ if ((((AF_INET == addr->sa.sa_family) && (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) error = sctp_add_bind_addr(dest, addr, sizeof(*addr), SCTP_ADDR_SRC, gfp); } return error; } /* Is this a wildcard address? */ int sctp_is_any(struct sock *sk, const union sctp_addr *addr) { unsigned short fam = 0; struct sctp_af *af; /* Try to get the right address family */ if (addr->sa.sa_family != AF_UNSPEC) fam = addr->sa.sa_family; else if (sk) fam = sk->sk_family; af = sctp_get_af_specific(fam); if (!af) return 0; return af->is_any(addr); } /* Is 'addr' valid for 'scope'? */ int sctp_in_scope(struct net *net, const union sctp_addr *addr, enum sctp_scope scope) { enum sctp_scope addr_scope = sctp_scope(addr); /* The unusable SCTP addresses will not be considered with * any defined scopes. */ if (SCTP_SCOPE_UNUSABLE == addr_scope) return 0; /* * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * Address scoping can be selectively controlled via sysctl * option */ switch (net->sctp.scope_policy) { case SCTP_SCOPE_POLICY_DISABLE: return 1; case SCTP_SCOPE_POLICY_ENABLE: if (addr_scope <= scope) return 1; break; case SCTP_SCOPE_POLICY_PRIVATE: if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope) return 1; break; case SCTP_SCOPE_POLICY_LINK: if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope) return 1; break; default: break; } return 0; } int sctp_is_ep_boundall(struct sock *sk) { struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *addr; bp = &sctp_sk(sk)->ep->base.bind_addr; if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) return 1; } return 0; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ /* What is the scope of 'addr'? */ enum sctp_scope sctp_scope(const union sctp_addr *addr) { struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (!af) return SCTP_SCOPE_UNUSABLE; return af->scope((union sctp_addr *)addr); } |
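As a usage illustration of the first-level API above, here is a small, hypothetical in-kernel caller (example_build_bind_list is not part of this file) exercising the init/add/query/free lifecycle; per the comments in sctp_add_bind_addr(), the caller is assumed to hold the socket lock.

static int example_build_bind_list(struct sctp_bind_addr *bp,
				   union sctp_addr *a, __u16 port)
{
	int err;

	/* Start with an empty list bound to 'port'. */
	sctp_bind_addr_init(bp, port);

	/* Entries copy at most sizeof(*a) bytes and inherit the port
	 * if the caller left sin_port zero. */
	err = sctp_add_bind_addr(bp, a, sizeof(*a), SCTP_ADDR_SRC,
				 GFP_ATOMIC);
	if (err)
		return err;

	/* The entry is now visible: state lookups return SCTP_ADDR_SRC. */
	WARN_ON(sctp_bind_addr_state(bp, a) != SCTP_ADDR_SRC);

	sctp_bind_addr_free(bp);
	return 0;
}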
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmalloc

#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMALLOC_H

#include <linux/tracepoint.h>

/**
 * alloc_vmap_area - called when a new vmap allocation occurs
 * @addr: an allocated address
 * @size: a requested size
 * @align: a requested alignment
 * @vstart: a requested start range
 * @vend: a requested end range
 * @failed: an allocation failed or not
 *
 * This event is used for debug purposes; it can give extra
 * information for a developer about how often it occurs and which
 * parameters are passed for further validation.
 */
TRACE_EVENT(alloc_vmap_area,

	TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend, int failed),

	TP_ARGS(addr, size, align, vstart, vend, failed),

	TP_STRUCT__entry(
		__field(unsigned long, addr)
		__field(unsigned long, size)
		__field(unsigned long, align)
		__field(unsigned long, vstart)
		__field(unsigned long, vend)
		__field(int, failed)
	),

	TP_fast_assign(
		__entry->addr = addr;
		__entry->size = size;
		__entry->align = align;
		__entry->vstart = vstart;
		__entry->vend = vend;
		__entry->failed = failed;
	),

	TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
		__entry->addr, __entry->size, __entry->align,
		__entry->vstart, __entry->vend, __entry->failed)
);

/**
 * purge_vmap_area_lazy - called when vmap areas were lazily freed
 * @start: purging start address
 * @end: purging end address
 * @npurged: number of purged vmap areas
 *
 * This event is used for debug purposes. It gives some
 * indication about the start:end range and how many objects
 * are released.
 */
TRACE_EVENT(purge_vmap_area_lazy,

	TP_PROTO(unsigned long start, unsigned long end,
		unsigned int npurged),

	TP_ARGS(start, end, npurged),

	TP_STRUCT__entry(
		__field(unsigned long, start)
		__field(unsigned long, end)
		__field(unsigned int, npurged)
	),

	TP_fast_assign(
		__entry->start = start;
		__entry->end = end;
		__entry->npurged = npurged;
	),

	TP_printk("start=0x%lx end=0x%lx num_purged=%u",
		__entry->start, __entry->end, __entry->npurged)
);

/**
 * free_vmap_area_noflush - called when a vmap area is freed
 * @va_start: a start address of VA
 * @nr_lazy: number of current lazy pages
 * @nr_lazy_max: number of maximum lazy pages
 *
 * This event is used for debug purposes. It gives some
 * indication about a VA that is released, the number of currently
 * outstanding areas, and the maximum allowed threshold before
 * dropping all of them.
 */
TRACE_EVENT(free_vmap_area_noflush,

	TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
		unsigned long nr_lazy_max),

	TP_ARGS(va_start, nr_lazy, nr_lazy_max),

	TP_STRUCT__entry(
		__field(unsigned long, va_start)
		__field(unsigned long, nr_lazy)
		__field(unsigned long, nr_lazy_max)
	),

	TP_fast_assign(
		__entry->va_start = va_start;
		__entry->nr_lazy = nr_lazy;
		__entry->nr_lazy_max = nr_lazy_max;
	),

	TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
		__entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
);

#endif /* _TRACE_VMALLOC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
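For context, the producer side of these events lives in the allocator itself. The sketch below shows the standard TRACE_EVENT usage pattern, assuming exactly one translation unit defines CREATE_TRACE_POINTS before including this header (the surrounding function, example_alloc_va, is invented for illustration).

/* In exactly one .c file (mm/vmalloc.c in practice): */
#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

/* Illustrative call site; trace_alloc_vmap_area() is generated from
 * the TRACE_EVENT() above and compiles to a static-branch no-op
 * unless the tracepoint is enabled. */
static unsigned long example_alloc_va(unsigned long size, unsigned long align,
				      unsigned long vstart, unsigned long vend)
{
	unsigned long addr = 0;	/* a real allocator would search here */

	trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == 0);
	return addr;
}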
// SPDX-License-Identifier: GPL-2.0-only #include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> #include <linux/backing-dev.h> #include <linux/blk-cgroup.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> #include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); wb_thresh = wb_calc_thresh(wb, dirty_thresh); seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), K(wb_thresh), K(dirty_thresh), K(background_thresh), (unsigned long) K(wb_stat(wb, WB_DIRTIED)), (unsigned long)
K(wb_stat(wb, WB_WRITTEN)), (unsigned long) K(wb->write_bandwidth), nr_dirty, nr_io, nr_more_io, nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t min_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_fine_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio_no_scale(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio_fine, bdi->max_ratio) static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); } static ssize_t min_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_min_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(min_bytes); static 
ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); } static ssize_t max_bytes_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); u64 bytes; ssize_t ret; ret = kstrtoull(buf, 10, &bytes); if (ret < 0) return ret; ret = bdi_set_max_bytes(bdi, bytes); if (!ret) ret = count; return ret; } static DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static ssize_t strict_limit_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int strict_limit; ssize_t ret; ret = kstrtouint(buf, 10, &strict_limit); if (ret < 0) return ret; ret = bdi_set_strict_limit(bdi, strict_limit); if (!ret) ret = count; return ret; } static ssize_t strict_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { struct backing_dev_info *bdi = dev_get_drvdata(dev); return sysfs_emit(buf, "%d\n", !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); } static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static const struct class bdi_class = { .name = "bdi", .dev_groups = bdi_dev_groups, }; static __init int bdi_class_init(void) { int ret; ret = class_register(&bdi_class); if (ret) return ret; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int __init default_bdi_init(void) { bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; return 0; } subsys_initcall(default_bdi_init); /* * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just * set up a timer which wakes the bdi thread up later. * * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. * * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). 
*/ void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->dwork, timeout); spin_unlock_irq(&wb->work_lock); } static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, bw_dwork); wb_update_bandwidth(wb); } /* * Initial write bandwidth: 100 MB/s */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int i, err; memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); if (err) goto out_destroy_stat; } return 0; out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_irq(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_irq(&wb->work_lock); return; } spin_unlock_irq(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) { int i; WARN_ON(delayed_work_pending(&wb->dwork)); for (i = 0; i < NR_WB_STAT_ITEMS; i++) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. 
*/ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static LIST_HEAD(offline_cgwbs); static void cleanup_offline_cgwbs_workfn(struct work_struct *work); static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); static void cgwb_free_rcu(struct rcu_head *rcu_head) { struct bdi_writeback *wb = container_of(rcu_head, struct bdi_writeback, rcu); percpu_ref_exit(&wb->refcnt); kfree(wb); } static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); spin_lock_irq(&cgwb_lock); list_del(&wb->offline_node); spin_unlock_irq(&cgwb_lock); wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); call_rcu(&wb->rcu, cgwb_free_rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). 
*/ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. 
*/ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_alloc(gfp); do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /* * cleanup_offline_cgwbs_workfn - try to release dying cgwbs * * Try to release dying cgwbs by switching attached inodes to the nearest * living ancestor's writeback. Processed wbs are placed at the end * of the list to guarantee the forward progress. */ static void cleanup_offline_cgwbs_workfn(struct work_struct *work) { struct bdi_writeback *wb; LIST_HEAD(processed); spin_lock_irq(&cgwb_lock); while (!list_empty(&offline_cgwbs)) { wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, offline_node); list_move(&wb->offline_node, &processed); /* * If wb is dirty, cleaning up the writeback by switching * attached inodes will result in an effective removal of any * bandwidth restrictions, which isn't the goal. Instead, * it can be postponed until the next time, when all io * will be likely completed. If in the meantime some inodes * will get re-dirtied, they should be eventually switched to * a new cgwb. */ if (wb_has_dirty_io(wb)) continue; if (!wb_tryget(wb)) continue; spin_unlock_irq(&cgwb_lock); while (cleanup_offline_cgwb(wb)) cond_resched(); spin_lock_irq(&cgwb_lock); wb_put(wb); } if (!list_empty(&processed)) list_splice_tail(&processed, &offline_cgwbs); spin_unlock_irq(&cgwb_lock); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. 
*/ void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. 
*/ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { del_timer_sync(&bdi->laptop_mode_wb_timer); /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); /* * If this BDI's min ratio has been set, use bdi_set_min_ratio() to * update the global bdi_min_ratio. */ if (bdi->min_ratio) bdi_set_min_ratio(bdi, 0); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb; if (!inode) return &noop_backing_dev_info; sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } EXPORT_SYMBOL(inode_to_bdi); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); |
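/*
 * Illustrative sketch added by the editor (not part of the original file):
 * the typical lifecycle of the bdi API defined above, as a block driver
 * might use it. The "exampledev" name and the two helpers are hypothetical;
 * only bdi_alloc()/bdi_register()/bdi_unregister()/bdi_put() come from
 * this file.
 */
#if 0
static struct backing_dev_info *example_bdi_setup(int node)
{
	struct backing_dev_info *bdi;

	bdi = bdi_alloc(node);		/* refcount == 1, unregistered */
	if (!bdi)
		return NULL;

	if (bdi_register(bdi, "exampledev%d", 0)) {
		bdi_put(bdi);		/* drops the final reference */
		return NULL;
	}
	return bdi;			/* now visible in bdi_list/bdi_tree */
}

static void example_bdi_teardown(struct backing_dev_info *bdi)
{
	bdi_unregister(bdi);	/* unlink from bdi_list/bdi_tree, shut down wb */
	bdi_put(bdi);		/* frees the bdi once the last ref is gone */
}
#endif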
// SPDX-License-Identifier: GPL-2.0+ /* * Apple Cinema Display driver * * Copyright (C) 2006 Michael Hanselmann (linux-kernel@hansmi.ch) * * Thanks to Caskey L. Dickson for his work with acdctl. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/backlight.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/atomic.h> #define APPLE_VENDOR_ID 0x05AC #define USB_REQ_GET_REPORT 0x01 #define USB_REQ_SET_REPORT 0x09 #define ACD_USB_TIMEOUT 250 #define ACD_USB_EDID 0x0302 #define ACD_USB_BRIGHTNESS 0x0310 #define ACD_BTN_NONE 0 #define ACD_BTN_BRIGHT_UP 3 #define ACD_BTN_BRIGHT_DOWN 4 #define ACD_URB_BUFFER_LEN 2 #define ACD_MSG_BUFFER_LEN 2 #define APPLEDISPLAY_DEVICE(prod) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_INT_CLASS | \ USB_DEVICE_ID_MATCH_INT_PROTOCOL, \ .idVendor = APPLE_VENDOR_ID, \ .idProduct = (prod), \ .bInterfaceClass = USB_CLASS_HID, \ .bInterfaceProtocol = 0x00 /* table of devices that work with this driver */ static const struct usb_device_id appledisplay_table[] = { { APPLEDISPLAY_DEVICE(0x9218) }, { APPLEDISPLAY_DEVICE(0x9219) }, { APPLEDISPLAY_DEVICE(0x921c) }, { APPLEDISPLAY_DEVICE(0x921d) }, { APPLEDISPLAY_DEVICE(0x9222) }, { APPLEDISPLAY_DEVICE(0x9226) }, { APPLEDISPLAY_DEVICE(0x9236) }, /* Terminating entry */ { } }; MODULE_DEVICE_TABLE(usb, appledisplay_table); /* Structure to hold all of our device specific stuff */ struct appledisplay { struct usb_device *udev; /* usb device */ struct urb *urb; /* usb request block */ struct backlight_device *bd; /* backlight device */ u8 *urbdata; /* interrupt URB data buffer */ u8 *msgdata; /* control message data buffer */ struct delayed_work work; int button_pressed; struct mutex sysfslock; /* concurrent read and write */ }; static atomic_t count_displays = ATOMIC_INIT(0); static void appledisplay_complete(struct urb *urb) { struct appledisplay *pdata = urb->context; struct device *dev = &pdata->udev->dev; int status = urb->status; int retval; switch (status) { case 0: /* success */ break; case -EOVERFLOW: dev_err(dev, "OVERFLOW with data length %d, actual length is %d\n", ACD_URB_BUFFER_LEN,
pdata->urb->actual_length); fallthrough; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* This urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } switch (pdata->urbdata[1]) { case ACD_BTN_BRIGHT_UP: case ACD_BTN_BRIGHT_DOWN: pdata->button_pressed = 1; schedule_delayed_work(&pdata->work, 0); break; case ACD_BTN_NONE: default: pdata->button_pressed = 0; break; } exit: retval = usb_submit_urb(pdata->urb, GFP_ATOMIC); if (retval) { dev_err(dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } } static int appledisplay_bl_update_status(struct backlight_device *bd) { struct appledisplay *pdata = bl_get_data(bd); int retval; mutex_lock(&pdata->sysfslock); pdata->msgdata[0] = 0x10; pdata->msgdata[1] = bd->props.brightness; retval = usb_control_msg( pdata->udev, usb_sndctrlpipe(pdata->udev, 0), USB_REQ_SET_REPORT, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE, ACD_USB_BRIGHTNESS, 0, pdata->msgdata, 2, ACD_USB_TIMEOUT); mutex_unlock(&pdata->sysfslock); if (retval < 0) return retval; else return 0; } static int appledisplay_bl_get_brightness(struct backlight_device *bd) { struct appledisplay *pdata = bl_get_data(bd); int retval, brightness; mutex_lock(&pdata->sysfslock); retval = usb_control_msg( pdata->udev, usb_rcvctrlpipe(pdata->udev, 0), USB_REQ_GET_REPORT, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, ACD_USB_BRIGHTNESS, 0, pdata->msgdata, 2, ACD_USB_TIMEOUT); if (retval < 2) { if (retval >= 0) retval = -EMSGSIZE; } else { brightness = pdata->msgdata[1]; } mutex_unlock(&pdata->sysfslock); if (retval < 0) return retval; else return brightness; } static const struct backlight_ops appledisplay_bl_data = { .get_brightness = appledisplay_bl_get_brightness, .update_status = appledisplay_bl_update_status, }; static void appledisplay_work(struct work_struct *work) { struct appledisplay *pdata = container_of(work, struct appledisplay, work.work); int retval; retval = appledisplay_bl_get_brightness(pdata->bd); if (retval >= 0) pdata->bd->props.brightness = retval; /* Poll again in about 125ms if there's still a button pressed */ if (pdata->button_pressed) schedule_delayed_work(&pdata->work, HZ / 8); } static int appledisplay_probe(struct usb_interface *iface, const struct usb_device_id *id) { struct backlight_properties props; struct appledisplay *pdata; struct usb_device *udev = interface_to_usbdev(iface); struct usb_endpoint_descriptor *endpoint; int int_in_endpointAddr = 0; int retval, brightness; char bl_name[20]; /* set up the endpoint information */ /* use only the first interrupt-in endpoint */ retval = usb_find_int_in_endpoint(iface->cur_altsetting, &endpoint); if (retval) { dev_err(&iface->dev, "Could not find int-in endpoint\n"); return retval; } int_in_endpointAddr = endpoint->bEndpointAddress; /* allocate memory for our device state and initialize it */ pdata = kzalloc(sizeof(struct appledisplay), GFP_KERNEL); if (!pdata) { retval = -ENOMEM; goto error; } pdata->udev = udev; INIT_DELAYED_WORK(&pdata->work, appledisplay_work); mutex_init(&pdata->sysfslock); /* Allocate buffer for control messages */ pdata->msgdata = kmalloc(ACD_MSG_BUFFER_LEN, GFP_KERNEL); if (!pdata->msgdata) { retval = -ENOMEM; goto error; } /* Allocate interrupt URB */ pdata->urb = usb_alloc_urb(0, GFP_KERNEL); if (!pdata->urb) { retval = -ENOMEM; goto error; } /* Allocate buffer for interrupt data */ pdata->urbdata =
usb_alloc_coherent(pdata->udev, ACD_URB_BUFFER_LEN, GFP_KERNEL, &pdata->urb->transfer_dma); if (!pdata->urbdata) { retval = -ENOMEM; dev_err(&iface->dev, "Allocating URB buffer failed\n"); goto error; } /* Configure interrupt URB */ usb_fill_int_urb(pdata->urb, udev, usb_rcvintpipe(udev, int_in_endpointAddr), pdata->urbdata, ACD_URB_BUFFER_LEN, appledisplay_complete, pdata, 1); pdata->urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; if (usb_submit_urb(pdata->urb, GFP_KERNEL)) { retval = -EIO; dev_err(&iface->dev, "Submitting URB failed\n"); goto error; } /* Register backlight device */ snprintf(bl_name, sizeof(bl_name), "appledisplay%d", atomic_inc_return(&count_displays) - 1); memset(&props, 0, sizeof(struct backlight_properties)); props.type = BACKLIGHT_RAW; props.max_brightness = 0xff; pdata->bd = backlight_device_register(bl_name, NULL, pdata, &appledisplay_bl_data, &props); if (IS_ERR(pdata->bd)) { dev_err(&iface->dev, "Backlight registration failed\n"); retval = PTR_ERR(pdata->bd); goto error; } /* Try to get brightness */ brightness = appledisplay_bl_get_brightness(pdata->bd); if (brightness < 0) { retval = brightness; dev_err(&iface->dev, "Error while getting initial brightness: %d\n", retval); goto error; } /* Set brightness in backlight device */ pdata->bd->props.brightness = brightness; /* save our data pointer in the interface device */ usb_set_intfdata(iface, pdata); printk(KERN_INFO "appledisplay: Apple Cinema Display connected\n"); return 0; error: if (pdata) { if (pdata->urb) { usb_kill_urb(pdata->urb); cancel_delayed_work_sync(&pdata->work); usb_free_coherent(pdata->udev, ACD_URB_BUFFER_LEN, pdata->urbdata, pdata->urb->transfer_dma); usb_free_urb(pdata->urb); } if (!IS_ERR(pdata->bd)) backlight_device_unregister(pdata->bd); kfree(pdata->msgdata); } usb_set_intfdata(iface, NULL); kfree(pdata); return retval; } static void appledisplay_disconnect(struct usb_interface *iface) { struct appledisplay *pdata = usb_get_intfdata(iface); if (pdata) { usb_kill_urb(pdata->urb); cancel_delayed_work_sync(&pdata->work); backlight_device_unregister(pdata->bd); usb_free_coherent(pdata->udev, ACD_URB_BUFFER_LEN, pdata->urbdata, pdata->urb->transfer_dma); usb_free_urb(pdata->urb); kfree(pdata->msgdata); kfree(pdata); } printk(KERN_INFO "appledisplay: Apple Cinema Display disconnected\n"); } static struct usb_driver appledisplay_driver = { .name = "appledisplay", .probe = appledisplay_probe, .disconnect = appledisplay_disconnect, .id_table = appledisplay_table, }; module_usb_driver(appledisplay_driver); MODULE_AUTHOR("Michael Hanselmann"); MODULE_DESCRIPTION("Apple Cinema Display driver"); MODULE_LICENSE("GPL"); |
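/*
 * Editor's note (not part of the original driver): ACD_USB_BRIGHTNESS
 * (0x0310) is the wValue of a USB HID class request - the high byte (0x03)
 * selects a Feature report and the low byte (0x10) is the report ID, which
 * is why msgdata[0] is set to 0x10 before each transfer. A hedged,
 * self-contained sketch of the same SET_REPORT transfer follows;
 * example_set_brightness() is hypothetical.
 */
#if 0
static int example_set_brightness(struct usb_device *udev, u8 brightness)
{
	/* Control transfers need a DMA-able buffer, hence kmalloc() */
	u8 *report = kmalloc(2, GFP_KERNEL);
	int ret;

	if (!report)
		return -ENOMEM;
	report[0] = 0x10;		/* report ID (low byte of wValue) */
	report[1] = brightness;
	ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
			      USB_REQ_SET_REPORT,
			      USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE,
			      ACD_USB_BRIGHTNESS, 0 /* wIndex: interface */,
			      report, 2, ACD_USB_TIMEOUT);
	kfree(report);
	return ret < 0 ? ret : 0;
}
#endif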
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/mm.h> #include <linux/bitops.h> #include <asm/word-at-a-time.h> /* * Do a strnlen, return length of string *with* final '\0'. * 'count' is the user-supplied count, while 'max' is the * address space maximum. * * Return 0 for exceptions (which includes hitting the address * space maximum), or 'count+1' if hitting the user-supplied * maximum count. * * NOTE! We can sometimes overshoot the user-supplied maximum * if it fits in an aligned 'long'. The caller needs to check * the return value against "> max". */ static __always_inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long align, res = 0; unsigned long c; /* * Do everything aligned. But that means that we * need to also expand the maximum. */ align = (sizeof(unsigned long) - 1) & (unsigned long)src; src -= align; max += align; unsafe_get_user(c, (unsigned long __user *)src, efault); c |= aligned_byte_mask(align); for (;;) { unsigned long data; if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); return res + find_zero(data) + 1 - align; } res += sizeof(unsigned long); /* We already handled 'unsigned long' bytes. Did we do it all? */ if (unlikely(max <= sizeof(unsigned long))) break; max -= sizeof(unsigned long); unsafe_get_user(c, (unsigned long __user *)(src+res), efault); } res -= align; /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, return the marker for "too long". */ if (res >= count) return count+1; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's 0. */ efault: return 0; } /** * strnlen_user: - Get the size of a user string INCLUDING final NUL. * @str: The string to measure. * @count: Maximum count (including NUL character) * * Context: User context only. This function may sleep if pagefaults are * enabled. * * Get the size of a NUL-terminated string in user space. * * Returns the size of the string INCLUDING the terminating NUL. * If the string is too long, returns a number larger than @count. User * has to check the return value against "> count". * On exception (or invalid count), returns 0. * * NOTE! You should basically never use this function. There is * almost never any valid case for using the length of a user space * string, since the string can be changed at any time by other * threads. Use "strncpy_from_user()" instead to get a stable copy * of the string.
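* * Example: with @count == 16, measuring "hello" returns 6 (five * characters plus the NUL); if no NUL appears among the first 16 bytes, * the function returns a value larger than @count (17 in the common case).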
*/ long strnlen_user(const char __user *str, long count) { unsigned long max_addr, src_addr; if (unlikely(count <= 0)) return 0; max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; if (user_read_access_begin(str, max)) { retval = do_strnlen_user(str, count, max); user_read_access_end(); return retval; } } return 0; } EXPORT_SYMBOL(strnlen_user);
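/*
 * Editor's illustration (not part of the original file): the word-at-a-time
 * zero-byte test that do_strnlen_user() relies on, written out with the
 * generic 64-bit little-endian constants. The arch-specific
 * <asm/word-at-a-time.h> helpers (has_zero() and friends) implement the
 * same arithmetic.
 */
#if 0
static inline int word_has_zero_byte(unsigned long x)
{
	const unsigned long ones = 0x0101010101010101UL;
	const unsigned long high = 0x8080808080808080UL;

	/* A byte's 0x80 bit survives here iff that byte was 0x00. */
	return ((x - ones) & ~x & high) != 0;
}
#endif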
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> * * Fix by Harrison Xing <harrison@mountainviewdata.com>. * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. * Extended attributes for symlinks and special files added per * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz * and Andreas Gruenbacher <agruen@suse.de>. */ /* * Extended attributes are stored directly in inodes (on file systems with * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl * field contains the block number if an inode uses an additional block. All * attributes must fit in the inode and one additional block. Blocks that * contain the identical set of attributes may be shared among several inodes. * Identical blocks are detected by keeping a cache of blocks that have * recently been accessed. * * The attributes in inodes and on blocks have a different header; the entries * are stored in the same format: * * +------------------+ * | header | * | entry 1 | | * | entry 2 | | growing downwards * | entry 3 | v * | four null bytes | * | . . . | * | value 1 | ^ * | value 3 | | growing upwards * | value 2 | | * +------------------+ * * The header is followed by multiple entry descriptors. In disk blocks, the * entry descriptors are kept sorted. In inodes, they are unsorted. The * attribute values are aligned to the end of the block in no specific order. * * Locking strategy * ---------------- * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. * EA blocks are only changed if they are exclusive to an inode, so * holding xattr_sem also means that nothing but the EA block's reference * count can change. Multiple writers to the same block are synchronized * by the buffer lock. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) \ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...)
no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_block_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif &ext4_xattr_hurd_handler, NULL }; #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { struct ext4_inode_info *ei = EXT4_I(ea_inode); lockdep_set_subclass(&ea_inode->i_rwsem, 1); (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); } static int ext4_xattr_block_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; if (ext4_has_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); unlock_buffer(bh); } return ret; } static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { if (ext4_has_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } static inline const char *ext4_xattr_prefix(int name_index, struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; if (!xattr_handler_can_list(handler, dentry)) return NULL; return xattr_prefix(handler); } static int check_xattrs(struct inode *inode, struct buffer_head *bh, struct ext4_xattr_entry *entry, void *end, void *value_start, const char *function, unsigned int line) { struct ext4_xattr_entry *e = 
entry; int err = -EFSCORRUPTED; char *err_str; if (bh) { if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { err_str = "invalid header"; goto errout; } if (buffer_verified(bh)) return 0; if (!ext4_xattr_block_csum_verify(inode, bh)) { err = -EFSBADCRC; err_str = "invalid checksum"; goto errout; } } else { struct ext4_xattr_ibody_header *header = value_start; header -= 1; if (end - (void *)header < sizeof(*header) + sizeof(u32)) { err_str = "in-inode xattr block too small"; goto errout; } if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { err_str = "bad magic number in in-inode xattr"; goto errout; } } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) { err_str = "e_name out of bounds"; goto errout; } if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { err_str = "bad e_name length"; goto errout; } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { err_str = "ea_inode specified without ea_inode feature enabled"; goto errout; } if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || !ext4_valid_inum(inode->i_sb, ea_ino))) { err_str = "invalid ea_ino"; goto errout; } if (size > EXT4_XATTR_SIZE_MAX) { err_str = "e_value size too large"; goto errout; } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); void *value; /* * The value cannot overlap the names, and the value * with padding cannot extend beyond 'end'. Check both * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ if (offs > end - value_start) { err_str = "e_value out of bounds"; goto errout; } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || EXT4_XATTR_SIZE(size) > end - value) { err_str = "overlapping e_value "; goto errout; } } entry = EXT4_XATTR_NEXT(entry); } if (bh) set_buffer_verified(bh); return 0; errout: if (bh) __ext4_error_inode(inode, function, line, 0, -err, "corrupted xattr block %llu: %s", (unsigned long long) bh->b_blocknr, err_str); else __ext4_error_inode(inode, function, line, 0, -err, "corrupted in-inode xattr: %s", err_str); return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) static inline int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), function, line); } #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) { struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { next = EXT4_XATTR_NEXT(entry); if ((void *) next >= end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); return -EFSCORRUPTED; } cmp = name_index - 
entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); if (cmp <= 0 && (sorted || cmp == 0)) break; } *pentry = entry; return cmp ? -ENODATA : 0; } static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { inode_set_atime(ea_inode, hash, 0); } /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { int blocksize = 1 << ea_inode->i_blkbits; int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; int tail_size = (size % blocksize) ?: blocksize; struct buffer_head *bhs_inline[8]; struct buffer_head **bhs = bhs_inline; int i, ret; if (bh_count > ARRAY_SIZE(bhs_inline)) { bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); if (!bhs) return -ENOMEM; } ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, true /* wait */, bhs); if (ret) goto free_bhs; for (i = 0; i < bh_count; i++) { /* There shouldn't be any holes in ea_inode. */ if (!bhs[i]) { ret = -EFSCORRUPTED; goto put_bhs; } memcpy((char *)buf + blocksize * i, bhs[i]->b_data, i < bh_count - 1 ? blocksize : tail_size); } ret = 0; put_bhs: for (i = 0; i < bh_count; i++) brelse(bhs[i]); free_bhs: if (bhs != bhs_inline) kfree(bhs); return ret; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; /* * We have to check for this corruption early as otherwise * iget_locked() could wait indefinitely for the state of our * parent inode. */ if (parent->i_ino == ea_ino) { ext4_error(parent->i_sb, "Parent and EA inode have the same ino %lu", ea_ino); return -EFSCORRUPTED; } inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu err=%d", ea_ino, err); return err; } ext4_xattr_inode_set_class(inode); /* * Check whether this is an old Lustre-style xattr inode. Lustre * implementation does not have hash validation, rather it has a * backpointer from ea_inode to the parent inode. 
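* The backpointer is kept in the mtime field (see * EXT4_XATTR_INODE_GET_PARENT()), so an EA inode whose stored hash does * not match but whose parent ino and i_generation both match the parent * is treated as a Lustre-style EA inode with an implicit refcount of 1.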
*/ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && inode->i_generation == parent->i_generation) { ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { inode_lock(inode); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } *ea_inode = inode; return 0; } /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { struct mb_cache_entry *oe; if (!EA_INODE_CACHE(inode)) return; /* Wait for entry to get unused so that we can remove it */ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ext4_xattr_inode_get_hash(inode), inode->i_ino))) { mb_cache_entry_wait_unused(oe); mb_cache_entry_put(EA_INODE_CACHE(inode), oe); } } static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { u32 hash; /* Verify stored hash matches calculated hash. */ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; if (entry) { __le32 e_hash, tmp_data; /* Verify entry hash. */ tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); /* All good? */ if (e_hash == entry->e_hash) return 0; /* * Not good. Maybe the entry hash was calculated * using the buggy signed char version? */ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, &tmp_data, 1); /* Still no match - bad */ if (e_hash != entry->e_hash) return -EFSCORRUPTED; /* Let people know about old hash */ pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } /* * Read xattr value from the EA inode. */ static int ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; } if (i_size_read(ea_inode) != size) { ext4_warning_inode(ea_inode, "ea_inode file size=%llu entry size=%zu", i_size_read(ea_inode), size); err = -EFSCORRUPTED; goto out; } err = ext4_xattr_inode_read(ea_inode, buffer, size); if (err) goto out; if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } if (ea_inode_cache) mb_cache_entry_create(ea_inode_cache, GFP_NOFS, ext4_xattr_inode_get_hash(ea_inode), ea_inode->i_ino, true /* reusable */); } out: iput(ea_inode); return err; } static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if 
(error) goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); end = bh->b_data + bh->b_size; error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = bh->b_data + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(bh); return error; } int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; struct ext4_iloc iloc; size_t size; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = (void *)IFIRST(header) + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(iloc.bh); return error; } /* * ext4_xattr_get() * * Copy an extended attribute into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. 
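* * For example, a getxattr(2) call for "user.foo" arrives here with * name_index == EXT4_XATTR_INDEX_USER and name == "foo"; the "user." * prefix has already been stripped by the VFS xattr handler.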
*/ int ext4_xattr_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { int error; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) return -ERANGE; down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); if (error == -ENODATA) error = ext4_xattr_block_get(inode, name_index, name, buffer, buffer_size); up_read(&EXT4_I(inode)->xattr_sem); return error; } static int ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { const char *prefix; prefix = ext4_xattr_prefix(entry->e_name_index, dentry); if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; if (buffer) { if (size > rest) return -ERANGE; memcpy(buffer, prefix, prefix_len); buffer += prefix_len; memcpy(buffer, entry->e_name, entry->e_name_len); buffer += entry->e_name_len; *buffer++ = 0; } rest -= size; } } return buffer_size - rest; /* total size */ } static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); return error; } static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: brelse(iloc.bh); return error; } /* * Inode operation listxattr() * * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. 
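* * The names are emitted back to back, each NUL-terminated and with its * prefix re-attached (e.g. "user.foo\0security.selinux\0"), with in-inode * entries listed before xattr block entries.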
*/ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; if (buffer) { buffer += ret; buffer_size -= ret; } ret = ext4_xattr_block_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; ret += ret2; errout: up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } /* * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. */ static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) { struct ext4_iloc iloc = { .bh = NULL }; struct buffer_head *bh = NULL; struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; ret = xattr_check_inode(inode, header, end); if (ret) goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); bh = NULL; goto out; } ret = ext4_xattr_check_block(inode, bh); if (ret) goto out; for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } *usage = ea_inode_refs + 1; ret = 0; out: brelse(iloc.bh); brelse(bh); return ret; } static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + inode->i_blkbits); size_t mask = ~(cluster_size - 1); return (length + cluster_size - 1) & mask; } static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) { int err; err = dquot_alloc_inode(inode); if (err) return err; err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); if (err) dquot_free_inode(inode); return err; } static void ext4_xattr_inode_free_quota(struct inode *parent, struct inode *ea_inode, size_t len) { if (ea_inode && ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) return; dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create) { int credits; int blocks; /* * 1) Owner inode update * 2) Ref count update on old xattr block * 3) new xattr block * 4) block bitmap update for new xattr block * 5) group descriptor for new xattr block * 6) block bitmap update for old xattr block * 7) group descriptor for old block * * 6 & 7 can happen if we have two racing threads T_a and T_b * which are each trying to set an xattr on inodes 
I_a and I_b * which were both initially sharing an xattr block. */ credits = 7; /* Quota updates. */ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); /* * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) credits += ext4_writepage_trans_blocks(inode) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) return credits; /* New ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks. */ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; /* Blocks themselves. */ credits += blocks; if (!is_create) { /* Dereference ea_inode holding old xattr value. * Old ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks for old ea_inode. */ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree for old * ea_inode. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. */ if (block_bh) { struct ext4_xattr_entry *entry = BFIRST(block_bh); for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) /* Ref count update on ea_inode. */ credits += 1; } return credits; } static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; s64 ref_count; int ret; inode_lock(ea_inode); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); } } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: inode_unlock(ea_inode); return ret; } static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, 1); } static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, -1); } static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, struct ext4_xattr_entry *first) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *failed_entry; unsigned int ea_ino; int err, saved_err; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { 
ext4_warning_inode(ea_inode, "inc ref error %d", err); iput(ea_inode); goto cleanup; } iput(ea_inode); } return 0; cleanup: saved_err = err; failed_entry = entry; for (entry = first; entry != failed_entry; entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, err); continue; } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); } return saved_err; } static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, bool block_csum, bool dirty) { int error; if (bh && dirty) { if (block_csum) ext4_xattr_block_csum_set(inode, bh); error = ext4_handle_dirty_metadata(handle, NULL, bh); if (error) { ext4_warning(inode->i_sb, "Handle metadata (error %d)", error); return error; } } return 0; } static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry *entry; bool dirty = false; unsigned int ea_ino; int err; int credits; /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) continue; err = ext4_expand_inode_array(ea_inode_array, ea_inode); if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); iput(ea_inode); continue; } err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, dirty)); if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } if (err > 0) { err = ext4_journal_get_write_access(handle, parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); continue; } } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); continue; } if (!skip_quota) ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that * decrements the ref count. This avoids duplicate decrements in * case the rest of the work spills over to subsequent * transactions. */ entry->e_value_inum = 0; entry->e_value_size = 0; dirty = true; } if (dirty) { /* * Note that we are deliberately skipping csum calculation for * the final update because we do not expect any journal * restarts until xattr block is freed. */ err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } } /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. 
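* (The mbcache entry is removed first, under the buffer lock, so the block cannot be freed while ext4_xattr_block_set() on another inode is still busy reusing it.)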
*/ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (error) goto out; retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bh->b_blocknr); if (oe) { unlock_buffer(bh); mb_cache_entry_wait_unused(oe); mb_cache_entry_put(ea_block_cache, oe); goto retry_ref; } } get_bh(bh); unlock_buffer(bh); if (ext4_has_feature_ea_inode(inode->i_sb)) ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { ref--; BHDR(bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; if (ea_block_cache) { ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } } ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } out: ext4_std_error(inode->i_sb, error); return; } /* * Find the available free space for EAs. This also returns the total number of * bytes used by EA entries. */ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } if (total) *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } /* * Write the value of the EA in an inode. 
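* (All blocks are allocated up front with ext4_map_blocks(), then the value is copied in one block at a time through the journal as metadata.)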
*/ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, const void *buf, int bufsize) { struct buffer_head *bh = NULL; unsigned long block = 0; int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; int ret = 0, ret2 = 0; int retries = 0; retry: while (ret >= 0 && ret < max_blocks) { struct ext4_map_blocks map; map.m_lblk = block += ret; map.m_len = max_blocks -= ret; ret = ext4_map_blocks(handle, ea_inode, &map, EXT4_GET_BLOCKS_CREATE); if (ret <= 0) { ext4_mark_inode_dirty(handle, ea_inode); if (ret == -ENOSPC && ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ret = 0; goto retry; } break; } } if (ret < 0) return ret; block = 0; while (wsize < bufsize) { brelse(bh); csize = (bufsize - wsize) > blocksize ? blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) { WARN_ON_ONCE(1); EXT4_ERROR_INODE(ea_inode, "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, EXT4_JTR_NONE); if (ret) goto out; memcpy(bh->b_data, buf, csize); set_buffer_uptodate(bh); ext4_handle_dirty_metadata(handle, ea_inode, bh); buf += csize; wsize += csize; block += 1; } inode_lock(ea_inode); i_size_write(ea_inode, wsize); ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); ret2 = ext4_mark_inode_dirty(handle, ea_inode); if (unlikely(ret2 && !ret)) ret = ret2; out: brelse(bh); return ret; } /* * Create an inode to store the value of a large EA. */ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; if (inode->i_sb->s_root == NULL) { ext4_warning(inode->i_sb, "refuse to create EA inode when umounting"); WARN_ON(1); return ERR_PTR(-EINVAL); } /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); unlock_new_inode(ea_inode); ext4_xattr_inode_set_ref(ea_inode, 1); ext4_xattr_inode_set_hash(ea_inode, hash); err = ext4_mark_inode_dirty(handle, ea_inode); if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } /* * Xattr inodes are shared therefore quota charging is performed * at a higher level. 
*/ dquot_free_inode(ea_inode); dquot_drop(ea_inode); inode_lock(ea_inode); ea_inode->i_flags |= S_NOQUOTA; inode_unlock(ea_inode); } return ea_inode; } static struct inode * ext4_xattr_inode_cache_find(struct inode *inode, const void *value, size_t value_len, u32 hash) { struct inode *ea_inode; struct mb_cache_entry *ce; struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; if (!ea_inode_cache) return NULL; ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) return NULL; WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); kvfree(ea_data); return ea_inode; } iput(ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); return NULL; } /* * Add value of the EA in an inode. */ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, const void *value, size_t value_len, struct inode **ret_inode) { struct inode *ea_inode; u32 hash; int err; hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { iput(ea_inode); return err; } *ret_inode = ea_inode; return 0; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); *ret_inode = ea_inode; return 0; } /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. */ #define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, bool is_block) { struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; /* Space used by old and new values. */ old_size = (!s->not_found && !here->e_value_inum) ? EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; /* * Optimization for the simple case when old and new values have the * same padded sizes. Not applicable if external inodes are involved. 
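* (Values are padded to 4-byte (EXT4_XATTR_PAD) multiples, so e.g. a 5-byte value and an 8-byte value both occupy one 8-byte slot and can be swapped in place.)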
*/ if (new_size && new_size == old_size) { size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; here->e_value_size = cpu_to_le32(i->value_len); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } goto update_hash; } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = next) { next = EXT4_XATTR_NEXT(last); if ((void *)next >= s->end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); ret = -EFSCORRUPTED; goto out; } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } /* Check whether we have enough space. */ if (i->value) { size_t free; free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) free += EXT4_XATTR_LEN(name_len) + old_size; if (free < EXT4_XATTR_LEN(name_len) + new_size) { ret = -ENOSPC; goto out; } /* * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; goto out; } } /* * Getting access to old and new ea inodes is subject to failures. * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; goto out; } } if (i->value && in_inode) { WARN_ON_ONCE(!i->value_len); ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); if (ret) goto out; ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len, &new_ea_inode); if (ret) { new_ea_inode = NULL; ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); if (ret) { /* Release newly required ref count on new_ea_inode. */ if (new_ea_inode) { int err; err = ext4_xattr_inode_dec_ref(handle, new_ea_inode); if (err) ext4_warning_inode(new_ea_inode, "dec ref new_ea_inode err=%d", err); ext4_xattr_inode_free_quota(inode, new_ea_inode, i->value_len); } goto out; } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } /* No failures allowed past this point. */ if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; memmove(first_val + old_size, first_val, val - first_val); memset(first_val, 0, old_size); min_offs += old_size; /* Adjust all value offsets. */ last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + old_size); last = EXT4_XATTR_NEXT(last); } } if (!i->value) { /* Remove old name. 
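* The entries that follow it are shifted down over it and the freed tail is zeroed.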
*/ size_t size = EXT4_XATTR_LEN(name_len); last = ENTRY((void *)last - size); memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); /* * Update i_inline_off - moved ibody region might contain * system.data attribute. Handling a failure here won't * cause other complications for setting an xattr. */ if (!is_block && ext4_has_inline_data(inode)) { ret = ext4_find_inline_data_nolock(inode); if (ret) { ext4_warning_inode(inode, "unable to update i_inline_off"); goto out; } } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); size_t rest = (void *)last - (void *)here + sizeof(__u32); memmove((void *)here + size, here, rest); memset(here, 0, size); here->e_name_index = i->name_index; here->e_name_len = name_len; memcpy(here->e_name, i->name, name_len); } else { /* This is an update, reset value info. */ here->e_value_inum = 0; here->e_value_offs = 0; here->e_value_size = 0; } if (i->value) { /* Insert new value. */ if (in_inode) { here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { void *val = s->base + min_offs - new_size; here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } } here->e_value_size = cpu_to_le32(i->value_len); } update_hash: if (i->value) { __le32 hash = 0; /* Entry hash calculation. */ if (in_inode) { __le32 crc32c_hash; /* * Feed crc32c hash instead of the raw value for entry * hash calculation. This is to avoid walking * potentially long value buffer again. */ crc32c_hash = cpu_to_le32( ext4_xattr_inode_get_hash(new_ea_inode)); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { __le32 *value = s->base + le16_to_cpu( here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, new_size >> 2); } here->e_hash = hash; } if (is_block) ext4_xattr_rehash((struct ext4_xattr_header *)s->base); ret = 0; out: iput(old_ea_inode); iput(new_ea_inode); return ret; } struct ext4_xattr_block_find { struct ext4_xattr_search s; struct buffer_head *bh; }; static int ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; int error; ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", i->name_index, i->name, i->value, (long)i->value_len); if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bs->bh)) { error = PTR_ERR(bs->bh); bs->bh = NULL; return error; } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) return error; /* Find the named attribute. 
*/ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) return error; bs->s.not_found = error; } return 0; } static int ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search s_copy = bs->s; struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); struct inode *ea_inode = NULL, *tmp_inode; size_t old_ea_inode_quota = 0; unsigned int ea_ino; #define header(x) ((struct ext4_xattr_header *)(x)) if (s->base) { int offset = (char *)s->here - bs->bh->b_data; BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect modified * block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bs->bh->b_blocknr); if (oe) { /* * Xattr block is getting reused. Leave * it alone. */ mb_cache_entry_put(ea_block_cache, oe); goto clone_block; } } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_metadata(handle, inode, bs->bh); if (error) goto cleanup; goto inserted; } clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; /* * If existing entry points to an xattr inode, we need * to prevent ext4_xattr_set_entry() from decrementing * ref count on it because the reference belongs to the * original block. In this case, make the entry look * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &tmp_inode); if (error) goto cleanup; if (!ext4_test_inode_state(tmp_inode, EXT4_STATE_LUSTRE_EA_INODE)) { /* * Defer quota free call for previous * inode until success is guaranteed. */ old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } iput(tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. 
*/ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); s->first = ENTRY(header(s->base)+1); s->here = ENTRY(header(s->base)+1); s->end = s->base + sb->s_blocksize; } error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; if (i->value && s->here->e_value_inum) { /* * A ref count on ea_inode has been taken as part of the call to * ext4_xattr_set_entry() above. We would like to drop this * extra ref but we have to wait until the xattr block is * initialized and has its own ref count on the ea_inode. */ ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &ea_inode); if (error) { ea_inode = NULL; goto cleanup; } } inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { u32 ref; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access( handle, sb, new_bh, EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); /* * We have to be careful about races with * adding references to xattr block. Once we * hold buffer lock xattr block's state is * stable so we can check the additional * reference fits. */ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. */ unlock_buffer(new_bh); dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup_dquot; } mb_cache_entry_touch(ea_block_cache, ce); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { /* We need to allocate a new block */ ext4_fsblk_t goal, block; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %llu", (unsigned long long)block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); goto cleanup; } error = ext4_xattr_inode_inc_ref_all(handle, inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; if (ea_inode) { /* Drop the extra ref on ea_inode. 
*/ error = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error) ext4_warning_inode(ea_inode, "dec ref error=%d", error); iput(ea_inode); ea_inode = NULL; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup; } } if (old_ea_inode_quota) ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { struct ext4_xattr_inode_array *ea_inode_array = NULL; ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); ext4_xattr_inode_array_free(ea_inode_array); } error = 0; cleanup: if (ea_inode) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); /* If there was an error, revert the quota charge. */ if (error) ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); return error; cleanup_dquot: dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = xattr_check_inode(inode, header, is->s.end); if (error) return error; /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; } return 0; } int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } static int ext4_xattr_value_same(struct ext4_xattr_search *s, struct ext4_xattr_info *i) { void *value; /* When e_value_inum is set the value is stored externally. 
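* A cheap in-place comparison is not possible then, so report the values as different and let the caller take the full update path.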
*/ if (s->here->e_value_inum) return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } static struct buffer_head *ext4_xattr_get_block(struct inode *inode) { struct buffer_head *bh; int error; if (!EXT4_I(inode)->i_file_acl) return NULL; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); return ERR_PTR(error); } return bh; } /* * ext4_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE * specify that an extended attribute must exist and must not exist * previous to the call, respectively. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { struct ext4_xattr_info i = { .name_index = name_index, .name = name, .value = value, .value_len = value_len, .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; ext4_write_lock_xattr(inode, &no_expand); /* Check journal credits under write lock. */ if (ext4_handle_valid(handle)) { struct buffer_head *bh; int credits; bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, flags & XATTR_CREATE); brelse(bh); if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto cleanup; if (is.s.not_found) error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; if (is.s.not_found && bs.s.not_found) { error = -ENODATA; if (flags & XATTR_REPLACE) goto cleanup; error = 0; if (!value) goto cleanup; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto cleanup; } if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { error = 0; /* Xattr value did not change? 
Save us some work and bail out */ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) goto cleanup; if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb) && (EXT4_XATTR_SIZE(i.value_len) > EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) i.in_inode = 1; retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { brelse(bs.bh); bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); } else if (error == -ENOSPC) { /* * Xattr does not fit in the block, store at * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with * error != 0. */ is.iloc.bh = NULL; if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); brelse(bs.bh); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits) { struct buffer_head *bh; int err; *credits = 0; if (!EXT4_SB(inode->i_sb)->s_journal) return 0; down_read(&EXT4_I(inode)->xattr_sem); bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, is_create); brelse(bh); err = 0; } up_read(&EXT4_I(inode)->xattr_sem); return err; } /* * ext4_xattr_set() * * Like ext4_xattr_set_handle, but start from an inode. This extended * attribute modification is a filesystem transaction by itself. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; int credits; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { int error2; error = ext4_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } /* * Shift the EA entries in the inode to create space for the increased * i_extra_isize. 
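* (Growing i_extra_isize moves the entry table to a higher address while the packed values stay at the end of the inode, so every e_value_offs shrinks; value_offs_shift is therefore negative here.)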
*/ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; /* We always shift xattr headers further thus offsets get lower */ BUG_ON(value_offs_shift > 0); /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); } } /* Shift the entries by n bytes */ memmove(to, from, n); } /* * Move xattr pointed to by 'entry' from inode into external xattr block */ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, struct ext4_xattr_entry *entry) { struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int needs_kvfree = 0; int error; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } is->s.not_found = -ENODATA; bs->s.not_found = -ENODATA; is->iloc.bh = NULL; bs->bh = NULL; /* Save the entry name and the entry value */ if (entry->e_value_inum) { buffer = kvmalloc(value_size, GFP_NOFS); if (!buffer) { error = -ENOMEM; goto out; } needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; error = ext4_get_inode_loc(inode, &is->iloc); if (error) goto out; error = ext4_xattr_ibody_find(inode, &i, is); if (error) goto out; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; /* Remove the chosen entry from the inode */ i.value = NULL; i.value_len = 0; error = ext4_xattr_ibody_set(handle, inode, &i, is); out: kfree(b_entry_name); if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) brelse(bs->bh); kfree(is); kfree(bs); return error; } static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, int isize_diff, size_t ifree, size_t bfree, int *total_ino) { struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); struct ext4_xattr_entry *small_entry; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size; int error; while (isize_diff > ifree) { entry = NULL; small_entry = NULL; min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { /* never move system.data out of the inode */ if ((last->e_name_len == 4) && (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && 
!memcmp(last->e_name, "data", 4)) continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; min_total_size = total_size; } } } if (entry == NULL) { if (small_entry == NULL) return -ENOSPC; entry = small_entry; } entry_size = EXT4_XATTR_LEN(entry->e_name_len); total_size = entry_size; if (!entry->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) return error; *total_ino -= entry_size; ifree += total_size; bfree -= total_size; } return 0; } /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; header = IHDR(inode, raw_inode); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; /* * Enough free space isn't available in the inode, check if * EA block can hold new_extra_isize bytes. 
*/ if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); goto cleanup; } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } error = ext4_xattr_make_inode_space(handle, inode, raw_inode, isize_diff, ifree, bfree, &total_ino); if (error) { if (error == -ENOSPC && !tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } goto cleanup; } shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; if (ext4_has_inline_data(inode)) error = ext4_find_inline_data_nolock(inode); cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } return error; } #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) /* Add the large xattr @inode into @ea_inode_array for deferred iput(). * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode) { if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. * If *ea_inode_array is NULL, this is essentially offsetof() */ (*ea_inode_array) = kmalloc(offsetof(struct ext4_xattr_inode_array, inodes[EIA_MASK]), GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; int count = (*ea_inode_array)->count; /* if new_array is NULL, this is essentially offsetof() */ new_array = kmalloc( offsetof(struct ext4_xattr_inode_array, inodes[count + EIA_INCR]), GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, offsetof(struct ext4_xattr_inode_array, inodes[count])); kfree(*ea_inode_array); *ea_inode_array = new_array; } (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; return 0; } /* * ext4_xattr_delete_inode() * * Free extended attribute resources associated with this inode. Traverse * all entries and decrement reference on any xattr inodes associated with this * inode. This is called immediately before an inode is freed. We have exclusive * access to the inode. If an orphan inode is deleted it will also release its * references on xattr block and xattr inodes. 
*/ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; struct inode *ea_inode; int error; error = ext4_journal_ensure_credits(handle, extra_credits, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } if (ext4_has_feature_ea_inode(inode->i_sb) && ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_get_inode_loc(inode, &iloc); if (error) { EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); goto cleanup; } error = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); goto cleanup; } header = IHDR(inode, ext4_raw_inode(&iloc)); if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, IFIRST(header), false /* block_csum */, ea_inode_array, extra_credits, false /* skip_quota */); } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { EXT4_ERROR_INODE_ERR(inode, EIO, "block %llu read error", EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; error = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (error) continue; ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); iput(ea_inode); } } ext4_xattr_release_block(handle, inode, bh, ea_inode_array, extra_credits); /* * Update i_file_acl value in the same transaction that releases * block. */ EXT4_I(inode)->i_file_acl = 0; error = ext4_mark_inode_dirty(handle, inode); if (error) { EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", error); goto cleanup; } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: brelse(iloc.bh); brelse(bh); return error; } void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { int idx; if (ea_inode_array == NULL) return; for (idx = 0; idx < ea_inode_array->count; ++idx) iput(ea_inode_array->inodes[idx]); kfree(ea_inode_array); } /* * ext4_xattr_block_cache_insert() * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); int reusable = le32_to_cpu(header->h_refcount) < EXT4_XATTR_REFCOUNT_MAX; int error; if (!ea_block_cache) return; error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); } else ea_bdebug(bh, "inserting [%x]", (int)hash); } /* * ext4_xattr_cmp() * * Compare two extended attribute blocks for equality. * * Returns 0 if the blocks are equal, 1 if they differ, and * a negative error number on errors. 
*/ static int ext4_xattr_cmp(struct ext4_xattr_header *header1, struct ext4_xattr_header *header2) { struct ext4_xattr_entry *entry1, *entry2; entry1 = ENTRY(header1+1); entry2 = ENTRY(header2+1); while (!IS_LAST_ENTRY(entry1)) { if (IS_LAST_ENTRY(entry2)) return 1; if (entry1->e_hash != entry2->e_hash || entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; if (!entry1->e_value_inum && memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; entry1 = EXT4_XATTR_NEXT(entry1); entry2 = EXT4_XATTR_NEXT(entry2); } if (!IS_LAST_ENTRY(entry2)) return 1; return 0; } /* * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * * Returns a pointer to the block found, or NULL if such a block was * not found or an error occurred. */ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, struct ext4_xattr_header *header, struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!ea_block_cache) return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) return NULL; bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } #define NAME_HASH_SHIFT 5 #define VALUE_HASH_SHIFT 16 /* * ext4_xattr_hash_entry() * * Compute the hash of an extended attribute. */ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (unsigned char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } /* * ext4_xattr_hash_entry_signed() * * Compute the hash of an extended attribute incorrectly, feeding the name * bytes in as signed chars. This reproduces the buggy hashes that older * kernels generated on platforms where char is signed, and is kept so such * on-disk hashes still verify. */ static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (signed char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT #undef VALUE_HASH_SHIFT #define BLOCK_HASH_SHIFT 16 /* * ext4_xattr_rehash() * * Re-compute the extended attribute hash value after an entry has changed. 
*/ static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { /* Block is not shared if an entry's hash value == 0 */ hash = 0; break; } hash = (hash << BLOCK_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ le32_to_cpu(here->e_hash); here = EXT4_XATTR_NEXT(here); } header->h_hash = cpu_to_le32(hash); } #undef BLOCK_HASH_SHIFT #define HASH_BUCKET_BITS 10 struct mb_cache * ext4_xattr_create_cache(void) { return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) mb_cache_destroy(cache); }
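For context, these entry points sit behind the xattr system calls. A minimal user-space sketch (illustrative only, not part of this file; assumes an ext4 file named "testfile" exists):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t len;

	/* Lands in ext4_xattr_set() -> ext4_xattr_set_handle(): the value is
	 * stored in the inode body, an xattr block, or (for large values with
	 * the ea_inode feature) a dedicated EA inode. */
	if (setxattr("testfile", "user.comment", "hello", 5, 0))
		perror("setxattr");

	/* Reads the value back; listing all names would go through
	 * ext4_listxattr() above. */
	len = getxattr("testfile", "user.comment", buf, sizeof(buf));
	if (len >= 0)
		printf("user.comment = %.*s\n", (int)len, buf);
	return 0;
}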
// SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as constrained */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. 
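* (SIDPOL_DEFAULT means no rule names @src, so ordinary kernel rules apply; SIDPOL_ALLOWED means an explicit @src -> @dst rule exists; SIDPOL_CONSTRAINED means rules exist for @src but none allows @dst.)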
*/ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. */ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETGID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities we're checking for are CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in them, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID) { if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. 
*/ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. */ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. 
*/ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .init = safesetid_security_init, .name = "safesetid", }; |
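To make the allowlist behaviour above concrete, here is a minimal userspace sketch. It assumes a SafeSetID policy has already been loaded for the caller's RUID (via the securityfs allowlist interface described in the kernel's SafeSetID admin guide); the target UID passed on the command line is hypothetical. If the old RUID is unconstrained, ordinary capability rules apply; if it is constrained and the target is not allowlisted, the task_fix_setuid hook returns -EACCES and the task is killed, exactly as in the code above.

/* Hedged sketch (hypothetical UIDs): exercise a set*uid transition that
 * safesetid_task_fix_setuid() will check against the old RUID's policy. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uid_t target;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <target-uid>\n", argv[0]);
		return 1;
	}
	target = (uid_t)strtoul(argv[1], NULL, 10);
	if (setuid(target) != 0) {	/* may not return: SIGKILL on a denied transition */
		perror("setuid");
		return 1;
	}
	printf("now running as uid %u\n", (unsigned)getuid());
	return 0;
}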
// SPDX-License-Identifier: GPL-2.0-only /* * Sync File validation framework and debug information * * Copyright (C) 2012 Google, Inc. */ #include <linux/debugfs.h> #include "sync_debug.h" static struct dentry *dbgfs; static LIST_HEAD(sync_timeline_list_head); static DEFINE_SPINLOCK(sync_timeline_list_lock); static LIST_HEAD(sync_file_list_head); static DEFINE_SPINLOCK(sync_file_list_lock); void sync_timeline_debug_add(struct sync_timeline *obj) { unsigned long flags; spin_lock_irqsave(&sync_timeline_list_lock, flags); list_add_tail(&obj->sync_timeline_list, &sync_timeline_list_head); spin_unlock_irqrestore(&sync_timeline_list_lock, flags); } void sync_timeline_debug_remove(struct sync_timeline *obj) { unsigned long flags; spin_lock_irqsave(&sync_timeline_list_lock, flags); list_del(&obj->sync_timeline_list); spin_unlock_irqrestore(&sync_timeline_list_lock, flags); } void sync_file_debug_add(struct sync_file *sync_file) { unsigned long flags; spin_lock_irqsave(&sync_file_list_lock, flags); list_add_tail(&sync_file->sync_file_list, &sync_file_list_head); spin_unlock_irqrestore(&sync_file_list_lock, flags); } void sync_file_debug_remove(struct sync_file *sync_file) { unsigned long flags; spin_lock_irqsave(&sync_file_list_lock, flags); list_del(&sync_file->sync_file_list); spin_unlock_irqrestore(&sync_file_list_lock, flags); } static const char *sync_status_str(int status) { if (status < 0) return "error"; if (status > 0) return "signaled"; return "active"; } static void sync_print_fence(struct seq_file *s, struct dma_fence *fence, bool show) { struct sync_timeline *parent = dma_fence_parent(fence); int status; status = dma_fence_get_status_locked(fence); seq_printf(s, " %s%sfence %s", show ? parent->name : "", show ?
"_" : "", sync_status_str(status)); if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags)) { struct timespec64 ts64 = ktime_to_timespec64(fence->timestamp); seq_printf(s, "@%lld.%09ld", (s64)ts64.tv_sec, ts64.tv_nsec); } if (fence->ops->timeline_value_str && fence->ops->fence_value_str) { char value[64]; bool success; fence->ops->fence_value_str(fence, value, sizeof(value)); success = strlen(value); if (success) { seq_printf(s, ": %s", value); fence->ops->timeline_value_str(fence, value, sizeof(value)); if (strlen(value)) seq_printf(s, " / %s", value); } } seq_putc(s, '\n'); } static void sync_print_obj(struct seq_file *s, struct sync_timeline *obj) { struct list_head *pos; seq_printf(s, "%s: %d\n", obj->name, obj->value); spin_lock_irq(&obj->lock); list_for_each(pos, &obj->pt_list) { struct sync_pt *pt = container_of(pos, struct sync_pt, link); sync_print_fence(s, &pt->base, false); } spin_unlock_irq(&obj->lock); } static void sync_print_sync_file(struct seq_file *s, struct sync_file *sync_file) { char buf[128]; int i; seq_printf(s, "[%p] %s: %s\n", sync_file, sync_file_get_name(sync_file, buf, sizeof(buf)), sync_status_str(dma_fence_get_status(sync_file->fence))); if (dma_fence_is_array(sync_file->fence)) { struct dma_fence_array *array = to_dma_fence_array(sync_file->fence); for (i = 0; i < array->num_fences; ++i) sync_print_fence(s, array->fences[i], true); } else { sync_print_fence(s, sync_file->fence, true); } } static int sync_info_debugfs_show(struct seq_file *s, void *unused) { struct list_head *pos; seq_puts(s, "objs:\n--------------\n"); spin_lock_irq(&sync_timeline_list_lock); list_for_each(pos, &sync_timeline_list_head) { struct sync_timeline *obj = container_of(pos, struct sync_timeline, sync_timeline_list); sync_print_obj(s, obj); seq_putc(s, '\n'); } spin_unlock_irq(&sync_timeline_list_lock); seq_puts(s, "fences:\n--------------\n"); spin_lock_irq(&sync_file_list_lock); list_for_each(pos, &sync_file_list_head) { struct sync_file *sync_file = container_of(pos, struct sync_file, sync_file_list); sync_print_sync_file(s, sync_file); seq_putc(s, '\n'); } spin_unlock_irq(&sync_file_list_lock); return 0; } DEFINE_SHOW_ATTRIBUTE(sync_info_debugfs); static __init int sync_debugfs_init(void) { dbgfs = debugfs_create_dir("sync", NULL); /* * The debugfs files won't ever get removed and thus, there is * no need to protect it against removal races. The use of * debugfs_create_file_unsafe() is actually safe here. */ debugfs_create_file_unsafe("info", 0444, dbgfs, NULL, &sync_info_debugfs_fops); debugfs_create_file_unsafe("sw_sync", 0644, dbgfs, NULL, &sw_sync_debugfs_fops); return 0; } late_initcall(sync_debugfs_init); |
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2020 SiFive */ #ifndef __ASM_RISCV_VECTOR_H #define __ASM_RISCV_VECTOR_H #include <linux/types.h> #include <uapi/asm-generic/errno.h> #ifdef CONFIG_RISCV_ISA_V #include <linux/stringify.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/ptrace.h> #include <asm/cpufeature.h> #include <asm/csr.h> #include <asm/asm.h> extern unsigned long riscv_v_vsize; int riscv_v_setup_vsize(void); bool riscv_v_first_use_handler(struct pt_regs *regs); void kernel_vector_begin(void); void kernel_vector_end(void); void get_cpu_vector_context(void); void put_cpu_vector_context(void); void riscv_v_thread_free(struct task_struct *tsk); void __init riscv_v_setup_ctx_cache(void); void riscv_v_thread_alloc(struct task_struct *tsk); static inline u32 riscv_v_flags(void) { return READ_ONCE(current->thread.riscv_v_flags); } static __always_inline bool has_vector(void) { return riscv_has_extension_unlikely(RISCV_ISA_EXT_v); } static inline void __riscv_v_vstate_clean(struct pt_regs *regs) { regs->status = (regs->status & ~SR_VS) | SR_VS_CLEAN; } static inline void __riscv_v_vstate_dirty(struct pt_regs *regs) { regs->status = (regs->status & ~SR_VS) | SR_VS_DIRTY; } static inline void riscv_v_vstate_off(struct pt_regs *regs) { regs->status = (regs->status & ~SR_VS) | SR_VS_OFF; } static inline void riscv_v_vstate_on(struct pt_regs *regs) { regs->status = (regs->status & ~SR_VS) | SR_VS_INITIAL; } static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return (regs->status & SR_VS) != 0; } static __always_inline void riscv_v_enable(void) { csr_set(CSR_SSTATUS, SR_VS); } static __always_inline void riscv_v_disable(void) { csr_clear(CSR_SSTATUS, SR_VS); } static __always_inline void __vstate_csr_save(struct __riscv_v_ext_state *dest) { asm volatile ( "csrr %0, " __stringify(CSR_VSTART) "\n\t" "csrr %1, " __stringify(CSR_VTYPE) "\n\t" "csrr %2, " __stringify(CSR_VL) "\n\t" "csrr %3, " __stringify(CSR_VCSR) "\n\t" "csrr %4, " __stringify(CSR_VLENB) "\n\t" : "=r" (dest->vstart), "=r" (dest->vtype), "=r" (dest->vl), "=r" (dest->vcsr), "=r" (dest->vlenb) : :); } static __always_inline void __vstate_csr_restore(struct __riscv_v_ext_state *src) { asm volatile ( ".option push\n\t" ".option arch, +v\n\t" "vsetvl x0, %2, %1\n\t" ".option pop\n\t" "csrw " __stringify(CSR_VSTART) ", %0\n\t" "csrw " __stringify(CSR_VCSR) ", %3\n\t" : : "r" (src->vstart), "r" (src->vtype), "r" (src->vl), "r" (src->vcsr)
:); } static inline void __riscv_v_vstate_save(struct __riscv_v_ext_state *save_to, void *datap) { unsigned long vl; riscv_v_enable(); __vstate_csr_save(save_to); asm volatile ( ".option push\n\t" ".option arch, +v\n\t" "vsetvli %0, x0, e8, m8, ta, ma\n\t" "vse8.v v0, (%1)\n\t" "add %1, %1, %0\n\t" "vse8.v v8, (%1)\n\t" "add %1, %1, %0\n\t" "vse8.v v16, (%1)\n\t" "add %1, %1, %0\n\t" "vse8.v v24, (%1)\n\t" ".option pop\n\t" : "=&r" (vl) : "r" (datap) : "memory"); riscv_v_disable(); } static inline void __riscv_v_vstate_restore(struct __riscv_v_ext_state *restore_from, void *datap) { unsigned long vl; riscv_v_enable(); asm volatile ( ".option push\n\t" ".option arch, +v\n\t" "vsetvli %0, x0, e8, m8, ta, ma\n\t" "vle8.v v0, (%1)\n\t" "add %1, %1, %0\n\t" "vle8.v v8, (%1)\n\t" "add %1, %1, %0\n\t" "vle8.v v16, (%1)\n\t" "add %1, %1, %0\n\t" "vle8.v v24, (%1)\n\t" ".option pop\n\t" : "=&r" (vl) : "r" (datap) : "memory"); __vstate_csr_restore(restore_from); riscv_v_disable(); } static inline void __riscv_v_vstate_discard(void) { unsigned long vl, vtype_inval = 1UL << (BITS_PER_LONG - 1); riscv_v_enable(); asm volatile ( ".option push\n\t" ".option arch, +v\n\t" "vsetvli %0, x0, e8, m8, ta, ma\n\t" "vmv.v.i v0, -1\n\t" "vmv.v.i v8, -1\n\t" "vmv.v.i v16, -1\n\t" "vmv.v.i v24, -1\n\t" "vsetvl %0, x0, %1\n\t" ".option pop\n\t" : "=&r" (vl) : "r" (vtype_inval) : "memory"); riscv_v_disable(); } static inline void riscv_v_vstate_discard(struct pt_regs *regs) { if ((regs->status & SR_VS) == SR_VS_OFF) return; __riscv_v_vstate_discard(); __riscv_v_vstate_dirty(regs); } static inline void riscv_v_vstate_save(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { if ((regs->status & SR_VS) == SR_VS_DIRTY) { __riscv_v_vstate_save(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } } static inline void riscv_v_vstate_restore(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { if ((regs->status & SR_VS) != SR_VS_OFF) { __riscv_v_vstate_restore(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } } static inline void riscv_v_vstate_set_restore(struct task_struct *task, struct pt_regs *regs) { if ((regs->status & SR_VS) != SR_VS_OFF) { set_tsk_thread_flag(task, TIF_RISCV_V_DEFER_RESTORE); riscv_v_vstate_on(regs); } } #ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE static inline bool riscv_preempt_v_dirty(struct task_struct *task) { return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_DIRTY); } static inline bool riscv_preempt_v_restore(struct task_struct *task) { return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_NEED_RESTORE); } static inline void riscv_preempt_v_clear_dirty(struct task_struct *task) { barrier(); task->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_DIRTY; } static inline void riscv_preempt_v_set_restore(struct task_struct *task) { barrier(); task->thread.riscv_v_flags |= RISCV_PREEMPT_V_NEED_RESTORE; } static inline bool riscv_preempt_v_started(struct task_struct *task) { return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V); } #else /* !CONFIG_RISCV_ISA_V_PREEMPTIVE */ static inline bool riscv_preempt_v_dirty(struct task_struct *task) { return false; } static inline bool riscv_preempt_v_restore(struct task_struct *task) { return false; } static inline bool riscv_preempt_v_started(struct task_struct *task) { return false; } #define riscv_preempt_v_clear_dirty(tsk) do {} while (0) #define riscv_preempt_v_set_restore(tsk) do {} while (0) #endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ static inline void __switch_to_vector(struct task_struct *prev, struct task_struct *next) { struct 
pt_regs *regs; if (riscv_preempt_v_started(prev)) { if (riscv_preempt_v_dirty(prev)) { __riscv_v_vstate_save(&prev->thread.kernel_vstate, prev->thread.kernel_vstate.datap); riscv_preempt_v_clear_dirty(prev); } } else { regs = task_pt_regs(prev); riscv_v_vstate_save(&prev->thread.vstate, regs); } if (riscv_preempt_v_started(next)) riscv_preempt_v_set_restore(next); else riscv_v_vstate_set_restore(next, task_pt_regs(next)); } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); bool riscv_v_vstate_ctrl_user_allowed(void); #else /* ! CONFIG_RISCV_ISA_V */ struct pt_regs; static inline int riscv_v_setup_vsize(void) { return -EOPNOTSUPP; } static __always_inline bool has_vector(void) { return false; } static inline bool riscv_v_first_use_handler(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vsize (0) #define riscv_v_vstate_discard(regs) do {} while (0) #define riscv_v_vstate_save(vstate, regs) do {} while (0) #define riscv_v_vstate_restore(vstate, regs) do {} while (0) #define __switch_to_vector(__prev, __next) do {} while (0) #define riscv_v_vstate_off(regs) do {} while (0) #define riscv_v_vstate_on(regs) do {} while (0) #define riscv_v_thread_free(tsk) do {} while (0) #define riscv_v_setup_ctx_cache() do {} while (0) #define riscv_v_thread_alloc(tsk) do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ #endif /* ! __ASM_RISCV_VECTOR_H */ |
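The lazy-save logic above (riscv_v_vstate_save() together with __switch_to_vector()) only spills the vector register file when the hardware has marked the state dirty; a clean context is switched out without a store. A standalone C model of that decision, with illustrative state values rather than the real SR_VS encoding:

/* Toy model of the dirty/clean save decision; states are illustrative. */
#include <stdio.h>

enum vs_state { VS_OFF, VS_INITIAL, VS_CLEAN, VS_DIRTY };

/* Mirrors riscv_v_vstate_save(): only a DIRTY context is written back,
 * then marked CLEAN so the next switch can skip the store. */
static enum vs_state context_switch_out(enum vs_state vs, int *saves)
{
	if (vs == VS_DIRTY) {
		(*saves)++;		/* __riscv_v_vstate_save() would run here */
		return VS_CLEAN;	/* __riscv_v_vstate_clean() */
	}
	return vs;			/* OFF/INITIAL/CLEAN: nothing to write back */
}

int main(void)
{
	int saves = 0;
	enum vs_state vs = VS_DIRTY;

	vs = context_switch_out(vs, &saves);	/* stores once */
	vs = context_switch_out(vs, &saves);	/* now CLEAN: skipped */
	printf("saves=%d final=%d\n", saves, vs);	/* saves=1 final=2 (VS_CLEAN) */
	return 0;
}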
/* * PCM Interface - misc routines * Copyright (c) 1998 by Jaroslav Kysela <perex@perex.cz> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <linux/export.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_local.h" #define SND_PCM_FORMAT_UNKNOWN (-1) /* NOTE: "signed" prefix must be given below since the default char is * unsigned on some architectures!
*/ struct pcm_format_data { unsigned char width; /* bit width */ unsigned char phys; /* physical bit width */ signed char le; /* 0 = big-endian, 1 = little-endian, -1 = others */ signed char signd; /* 0 = unsigned, 1 = signed, -1 = others */ unsigned char silence[8]; /* silence data to fill */ }; /* we do lots of calculations on snd_pcm_format_t; shut up sparse */ #define INT __force int static bool valid_format(snd_pcm_format_t format) { return (INT)format >= 0 && (INT)format <= (INT)SNDRV_PCM_FORMAT_LAST; } static const struct pcm_format_data pcm_formats[(INT)SNDRV_PCM_FORMAT_LAST+1] = { [SNDRV_PCM_FORMAT_S8] = { .width = 8, .phys = 8, .le = -1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U8] = { .width = 8, .phys = 8, .le = -1, .signd = 0, .silence = { 0x80 }, }, [SNDRV_PCM_FORMAT_S16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = { 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 0, .silence = { 0x80, 0x00 }, }, [SNDRV_PCM_FORMAT_S24_LE] = { .width = 24, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S24_BE] = { .width = 24, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U24_LE] = { .width = 24, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U24_BE] = { .width = 24, .phys = 32, .le = 0, .signd = 0, .silence = { 0x00, 0x80, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 0, .silence = { 0x80, 0x00, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_FLOAT_LE] = { .width = 32, .phys = 32, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT_BE] = { .width = 32, .phys = 32, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT64_LE] = { .width = 64, .phys = 64, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_FLOAT64_BE] = { .width = 64, .phys = 64, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_IEC958_SUBFRAME_LE] = { .width = 32, .phys = 32, .le = 1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_IEC958_SUBFRAME_BE] = { .width = 32, .phys = 32, .le = 0, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_MU_LAW] = { .width = 8, .phys = 8, .le = -1, .signd = -1, .silence = { 0x7f }, }, [SNDRV_PCM_FORMAT_A_LAW] = { .width = 8, .phys = 8, .le = -1, .signd = -1, .silence = { 0x55 }, }, [SNDRV_PCM_FORMAT_IMA_ADPCM] = { .width = 4, .phys = 4, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_24] = { .width = 3, .phys = 3, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_40] = { .width = 5, .phys = 5, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_DSD_U8] = { .width = 8, .phys = 8, .le = 1, .signd = 0, .silence = { 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U16_LE] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = { 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U32_LE] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, [SNDRV_PCM_FORMAT_DSD_U16_BE] = { .width = 16, .phys = 16, .le = 0, .signd = 0, .silence = { 0x69, 0x69 }, }, 
[SNDRV_PCM_FORMAT_DSD_U32_BE] = { .width = 32, .phys = 32, .le = 0, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, /* FIXME: the following two formats are not defined properly yet */ [SNDRV_PCM_FORMAT_MPEG] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_GSM] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_S20_LE] = { .width = 20, .phys = 32, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S20_BE] = { .width = 20, .phys = 32, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U20_LE] = { .width = 20, .phys = 32, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x08, 0x00 }, }, [SNDRV_PCM_FORMAT_U20_BE] = { .width = 20, .phys = 32, .le = 0, .signd = 0, .silence = { 0x00, 0x08, 0x00, 0x00 }, }, /* FIXME: the following format is not defined properly yet */ [SNDRV_PCM_FORMAT_SPECIAL] = { .le = -1, .signd = -1, }, [SNDRV_PCM_FORMAT_S24_3LE] = { .width = 24, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S24_3BE] = { .width = 24, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U24_3LE] = { .width = 24, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x80 }, }, [SNDRV_PCM_FORMAT_U24_3BE] = { .width = 24, .phys = 24, .le = 0, .signd = 0, .silence = { 0x80, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S20_3LE] = { .width = 20, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S20_3BE] = { .width = 20, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U20_3LE] = { .width = 20, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x08 }, }, [SNDRV_PCM_FORMAT_U20_3BE] = { .width = 20, .phys = 24, .le = 0, .signd = 0, .silence = { 0x08, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_S18_3LE] = { .width = 18, .phys = 24, .le = 1, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_S18_3BE] = { .width = 18, .phys = 24, .le = 0, .signd = 1, .silence = {}, }, [SNDRV_PCM_FORMAT_U18_3LE] = { .width = 18, .phys = 24, .le = 1, .signd = 0, .silence = { 0x00, 0x00, 0x02 }, }, [SNDRV_PCM_FORMAT_U18_3BE] = { .width = 18, .phys = 24, .le = 0, .signd = 0, .silence = { 0x02, 0x00, 0x00 }, }, [SNDRV_PCM_FORMAT_G723_24_1B] = { .width = 3, .phys = 8, .le = -1, .signd = -1, .silence = {}, }, [SNDRV_PCM_FORMAT_G723_40_1B] = { .width = 5, .phys = 8, .le = -1, .signd = -1, .silence = {}, }, }; /** * snd_pcm_format_signed - Check the PCM format is signed linear * @format: the format to check * * Return: 1 if the given PCM format is signed linear, 0 if unsigned * linear, and a negative error code for non-linear formats. */ int snd_pcm_format_signed(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].signd; if (val < 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_signed); /** * snd_pcm_format_unsigned - Check the PCM format is unsigned linear * @format: the format to check * * Return: 1 if the given PCM format is unsigned linear, 0 if signed * linear, and a negative error code for non-linear formats. */ int snd_pcm_format_unsigned(snd_pcm_format_t format) { int val; val = snd_pcm_format_signed(format); if (val < 0) return val; return !val; } EXPORT_SYMBOL(snd_pcm_format_unsigned); /** * snd_pcm_format_linear - Check the PCM format is linear * @format: the format to check * * Return: 1 if the given PCM format is linear, 0 if not. 
*/ int snd_pcm_format_linear(snd_pcm_format_t format) { return snd_pcm_format_signed(format) >= 0; } EXPORT_SYMBOL(snd_pcm_format_linear); /** * snd_pcm_format_little_endian - Check the PCM format is little-endian * @format: the format to check * * Return: 1 if the given PCM format is little-endian, 0 if * big-endian, or a negative error code if endian not specified. */ int snd_pcm_format_little_endian(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].le; if (val < 0) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_little_endian); /** * snd_pcm_format_big_endian - Check the PCM format is big-endian * @format: the format to check * * Return: 1 if the given PCM format is big-endian, 0 if * little-endian, or a negative error code if endian not specified. */ int snd_pcm_format_big_endian(snd_pcm_format_t format) { int val; val = snd_pcm_format_little_endian(format); if (val < 0) return val; return !val; } EXPORT_SYMBOL(snd_pcm_format_big_endian); /** * snd_pcm_format_width - return the bit-width of the format * @format: the format to check * * Return: The bit-width of the format, or a negative error code * if unknown format. */ int snd_pcm_format_width(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].width; if (!val) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_width); /** * snd_pcm_format_physical_width - return the physical bit-width of the format * @format: the format to check * * Return: The physical bit-width of the format, or a negative error code * if unknown format. */ int snd_pcm_format_physical_width(snd_pcm_format_t format) { int val; if (!valid_format(format)) return -EINVAL; val = pcm_formats[(INT)format].phys; if (!val) return -EINVAL; return val; } EXPORT_SYMBOL(snd_pcm_format_physical_width); /** * snd_pcm_format_size - return the byte size of samples on the given format * @format: the format to check * @samples: the number of samples * * Return: The byte size of the given samples for the format, or a * negative error code if unknown format. */ ssize_t snd_pcm_format_size(snd_pcm_format_t format, size_t samples) { int phys_width = snd_pcm_format_physical_width(format); if (phys_width < 0) return -EINVAL; return samples * phys_width / 8; } EXPORT_SYMBOL(snd_pcm_format_size); /** * snd_pcm_format_silence_64 - return the silent data in 8 bytes array * @format: the format to check * * Return: The format pattern to fill or %NULL if error. */ const unsigned char *snd_pcm_format_silence_64(snd_pcm_format_t format) { if (!valid_format(format)) return NULL; if (!pcm_formats[(INT)format].phys) return NULL; return pcm_formats[(INT)format].silence; } EXPORT_SYMBOL(snd_pcm_format_silence_64); /** * snd_pcm_format_set_silence - set the silence data on the buffer * @format: the PCM format * @data: the buffer pointer * @samples: the number of samples to set silence * * Sets the silence data on the buffer for the given samples. * * Return: Zero if successful, or a negative error code on failure.
*/ int snd_pcm_format_set_silence(snd_pcm_format_t format, void *data, unsigned int samples) { int width; unsigned char *dst; const unsigned char *pat; if (!valid_format(format)) return -EINVAL; if (samples == 0) return 0; width = pcm_formats[(INT)format].phys; /* physical width */ pat = pcm_formats[(INT)format].silence; if (!width || !pat) return -EINVAL; /* signed or 1 byte data */ if (pcm_formats[(INT)format].signd == 1 || width <= 8) { unsigned int bytes = samples * width / 8; memset(data, *pat, bytes); return 0; } /* non-zero samples, fill using a loop */ width /= 8; dst = data; #if 0 while (samples--) { memcpy(dst, pat, width); dst += width; } #else /* a bit optimization for constant width */ switch (width) { case 2: while (samples--) { memcpy(dst, pat, 2); dst += 2; } break; case 3: while (samples--) { memcpy(dst, pat, 3); dst += 3; } break; case 4: while (samples--) { memcpy(dst, pat, 4); dst += 4; } break; case 8: while (samples--) { memcpy(dst, pat, 8); dst += 8; } break; } #endif return 0; } EXPORT_SYMBOL(snd_pcm_format_set_silence); /** * snd_pcm_hw_limit_rates - determine rate_min/rate_max fields * @hw: the pcm hw instance * * Determines the rate_min and rate_max fields from the rates bits of * the given hw. * * Return: Zero if successful. */ int snd_pcm_hw_limit_rates(struct snd_pcm_hardware *hw) { int i; for (i = 0; i < (int)snd_pcm_known_rates.count; i++) { if (hw->rates & (1 << i)) { hw->rate_min = snd_pcm_known_rates.list[i]; break; } } for (i = (int)snd_pcm_known_rates.count - 1; i >= 0; i--) { if (hw->rates & (1 << i)) { hw->rate_max = snd_pcm_known_rates.list[i]; break; } } return 0; } EXPORT_SYMBOL(snd_pcm_hw_limit_rates); /** * snd_pcm_rate_to_rate_bit - converts sample rate to SNDRV_PCM_RATE_xxx bit * @rate: the sample rate to convert * * Return: The SNDRV_PCM_RATE_xxx flag that corresponds to the given rate, or * SNDRV_PCM_RATE_KNOT for an unknown rate. */ unsigned int snd_pcm_rate_to_rate_bit(unsigned int rate) { unsigned int i; for (i = 0; i < snd_pcm_known_rates.count; i++) if (snd_pcm_known_rates.list[i] == rate) return 1u << i; return SNDRV_PCM_RATE_KNOT; } EXPORT_SYMBOL(snd_pcm_rate_to_rate_bit); /** * snd_pcm_rate_bit_to_rate - converts SNDRV_PCM_RATE_xxx bit to sample rate * @rate_bit: the rate bit to convert * * Return: The sample rate that corresponds to the given SNDRV_PCM_RATE_xxx flag * or 0 for an unknown rate bit. */ unsigned int snd_pcm_rate_bit_to_rate(unsigned int rate_bit) { unsigned int i; for (i = 0; i < snd_pcm_known_rates.count; i++) if ((1u << i) == rate_bit) return snd_pcm_known_rates.list[i]; return 0; } EXPORT_SYMBOL(snd_pcm_rate_bit_to_rate); static unsigned int snd_pcm_rate_mask_sanitize(unsigned int rates) { if (rates & SNDRV_PCM_RATE_CONTINUOUS) return SNDRV_PCM_RATE_CONTINUOUS; else if (rates & SNDRV_PCM_RATE_KNOT) return SNDRV_PCM_RATE_KNOT; return rates; } /** * snd_pcm_rate_mask_intersect - computes the intersection between two rate masks * @rates_a: The first rate mask * @rates_b: The second rate mask * * This function computes the rates that are supported by both rate masks passed * to the function. It will take care of the special handling of * SNDRV_PCM_RATE_CONTINUOUS and SNDRV_PCM_RATE_KNOT. * * Return: A rate mask containing the rates that are supported by both rates_a * and rates_b. 
*/ unsigned int snd_pcm_rate_mask_intersect(unsigned int rates_a, unsigned int rates_b) { rates_a = snd_pcm_rate_mask_sanitize(rates_a); rates_b = snd_pcm_rate_mask_sanitize(rates_b); if (rates_a & SNDRV_PCM_RATE_CONTINUOUS) return rates_b; else if (rates_b & SNDRV_PCM_RATE_CONTINUOUS) return rates_a; else if (rates_a & SNDRV_PCM_RATE_KNOT) return rates_b; else if (rates_b & SNDRV_PCM_RATE_KNOT) return rates_a; return rates_a & rates_b; } EXPORT_SYMBOL_GPL(snd_pcm_rate_mask_intersect); /** * snd_pcm_rate_range_to_bits - converts rate range to SNDRV_PCM_RATE_xxx bit * @rate_min: the minimum sample rate * @rate_max: the maximum sample rate * * This function has an implicit assumption: the rates in the given range have * only the pre-defined rates like 44100 or 16000. * * Return: The SNDRV_PCM_RATE_xxx flag that corresponds to the given rate range, * or SNDRV_PCM_RATE_KNOT for an unknown range. */ unsigned int snd_pcm_rate_range_to_bits(unsigned int rate_min, unsigned int rate_max) { unsigned int rates = 0; int i; for (i = 0; i < snd_pcm_known_rates.count; i++) { if (snd_pcm_known_rates.list[i] >= rate_min && snd_pcm_known_rates.list[i] <= rate_max) rates |= 1 << i; } if (!rates) rates = SNDRV_PCM_RATE_KNOT; return rates; } EXPORT_SYMBOL_GPL(snd_pcm_rate_range_to_bits); |
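As a worked example of the pattern-fill path in snd_pcm_format_set_silence(): SNDRV_PCM_FORMAT_U16_LE has a 16-bit physical width and the silence pattern { 0x00, 0x80 } (the unsigned mid-point 0x8000 stored little-endian), so a silent buffer is just that two-byte pattern tiled across all samples. A minimal userspace sketch of the same loop:

/* Tile the U16_LE silence pattern across a buffer, as the width==2 case does. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const unsigned char pat[2] = { 0x00, 0x80 };	/* U16_LE silence */
	unsigned char buf[8];
	size_t samples = sizeof(buf) / sizeof(pat);	/* phys width / 8 bytes each */
	size_t i;

	for (i = 0; i < samples; i++)			/* the memcpy loop above */
		memcpy(buf + i * sizeof(pat), pat, sizeof(pat));

	for (i = 0; i < sizeof(buf); i++)
		printf("%02x ", buf[i]);
	printf("\n");					/* 00 80 00 80 00 80 00 80 */
	return 0;
}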
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILELOCK_H #define _LINUX_FILELOCK_H #include <linux/fs.h> #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking.
*/ #define FILE_LOCK_DEFERRED 1 struct file_lock; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_break)(struct file_lock *); int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* * struct file_lock has a union that some filesystems use to track * their own private info. The NFS side of things is defined here: */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The various i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock { struct file_lock *fl_blocker; /* The lock that is blocking us */ struct list_head fl_list; /* link into file_lock_context */ struct hlist_node fl_link; /* node in global lists */ struct list_head fl_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head fl_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t fl_owner; unsigned int fl_flags; unsigned char fl_type; unsigned int fl_pid; int fl_link_cpu; /* what cpu's list is this on?
*/ wait_queue_head_t fl_wait; struct file *fl_file; loff_t fl_start; loff_t fl_end; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; struct { struct inode *inode; } ceph; } fl_u; } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; #ifdef CONFIG_FILE_LOCKING int fcntl_getlk(struct file *, unsigned int, struct flock *); int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); struct file_lock * locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); void locks_remove_file(struct file *); void locks_release_private(struct file_lock *); void posix_test_lock(struct file *, struct file_lock *); int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); int locks_delete_block(struct file_lock *); int vfs_test_lock(struct file *, struct file_lock *); int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lock **, void **priv); int vfs_setlease(struct file *, int, struct file_lock **, void **); int lease_modify(struct file_lock *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct notifier_block *); void lease_unregister_notifier(struct notifier_block *); struct files_struct; void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return smp_load_acquire(&inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 
*user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { return false; } static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return NULL; } #endif /* !CONFIG_FILE_LOCKING */ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); } #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. 
*/ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { int ret; ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; ret = break_deleg(*delegated_inode, O_WRONLY); iput(*delegated_inode); *delegated_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, FL_LAYOUT); return 0; } #else /* !CONFIG_FILE_LOCKING */ static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { return 0; } static inline int break_deleg_wait(struct inode **delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ #endif /* _LINUX_FILELOCK_H */ |
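The struct file_lock fields above (fl_owner, the fl_start/fl_end range, blocking behaviour via FL_SLEEP) correspond directly to the classic fcntl() byte-range locking interface. A small sketch, using a hypothetical path under /tmp:

/* Take a non-blocking exclusive POSIX byte-range lock over bytes 0..99. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/lock-demo", O_RDWR | O_CREAT, 0600);
	struct flock fl = {
		.l_type = F_WRLCK,	/* exclusive FL_POSIX-style write lock */
		.l_whence = SEEK_SET,
		.l_start = 0,		/* fl_start */
		.l_len = 100,		/* covers [0, 99]; fl_end = 99 in the kernel */
	};

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fcntl(fd, F_SETLK, &fl) == -1)	/* F_SETLK: no FL_SLEEP, fails instead of blocking */
		perror("fcntl(F_SETLK)");
	else
		puts("byte-range lock acquired");
	close(fd);	/* the FL_CLOSE path: POSIX locks drop on close */
	return 0;
}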
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/ima.h> #include <linux/swap.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables...
*/ static struct files_stat_struct files_stat = { .max_files = NR_FILE }; /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) { return container_of(f, struct backing_file, file); } struct path *backing_file_user_path(struct file *f) { return &backing_file(f)->user_path; } EXPORT_SYMBOL_GPL(backing_file_user_path); static inline void file_free(struct file *f) { security_file_free(f); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); kfree(backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } } /* * Return the total number of open files in the system */ static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } /* * Return the maximum number of open files in the system */ unsigned long get_max_files(void) { return files_stat.max_files; } EXPORT_SYMBOL_GPL(get_max_files); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Handle nr_files sysctl */ static int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = SYSCTL_LONG_ZERO, .extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, }; static int __init init_fs_stat_sysctls(void) { register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } return 0; } fs_initcall(init_fs_stat_sysctls); #endif static int init_file(struct file *f, int flags, const struct cred *cred) { int error; f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { put_cred(f->f_cred); return error; } rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ atomic_long_set(&f->f_count, 1); return 0; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happened, e.g. we are over the file * structures limit, ran out of memory or the operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will unbalance the mount's writer count * and get a warning at __fput() time.
*/ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; int error; /* * Privileged users can go above max_files */ if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocated file must not be * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } f->f_mode |= FMODE_NOACCOUNT; return f; } /* * Variant of alloc_empty_file() that allocates a backing_file container * and doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocated file must not be * installed into file tables or such. */ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) { struct backing_file *ff; int error; ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { kfree(ff); return ERR_PTR(error); } ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; return &ff->file; } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_...
flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; file = alloc_empty_file(flags, current_cred()); if (IS_ERR(file)) return file; file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if (fop->llseek) file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { struct qstr this = QSTR_INIT(name, strlen(name)); struct path path; struct file *file; path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); if (!path.dentry) return ERR_PTR(-ENOMEM); path.mnt = mntget(mnt); d_instantiate(path.dentry, inode); file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); } return file; } EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { struct file *f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; } return f; } /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; might_sleep(); fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_file(file); ima_file_free(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); } static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { __fput(container_of(work, struct file, f_task_work)); } /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. 
Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); } EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { file_free(file); return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_task_work, ____fput); if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } /* * Synchronous analog of fput(); meant for kernel threads that might be * needed by some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks) but need to wait for completion of __fput(), knowing * that for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() calls made by a kernel thread to this. */ void __fput_sync(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) __fput(file); } EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. By default, * do not use more than 10% of our memory for files. */ void __init files_maxfiles_init(void) { unsigned long n; unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); }
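/*
 * Editor's note: an illustrative sketch of the rules the comments above
 * describe (hypothetical path and caller, not from the original source).
 * Ordinary process context just calls fput() and lets __fput() run from
 * task_work; only a boot-time caller that must observe the final __fput()
 * pairs it with flush_delayed_fput():
 *
 *	struct file *f = filp_open("/init", O_RDONLY, 0);
 *
 *	if (!IS_ERR(f)) {
 *		... read from f ...
 *		fput(f);		// final __fput() may be deferred
 *	}
 *	flush_delayed_fput();		// boot-time only: drain pending __fput()s
 */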
// SPDX-License-Identifier: GPL-2.0-only /* * fs/libfs.c * Library for filesystem writers. */ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> #include <linux/uaccess.h> #include "internal.h" int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; return 0; } EXPORT_SYMBOL(simple_statfs); /* * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately.
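 */

/*
 * Editor's note: a hedged sketch (hypothetical filesystem, not from the
 * original source) of how an in-memory filesystem opts every dentry into
 * this delete-immediately policy at superblock-setup time:
 *
 *	static int examplefs_fill_super(struct super_block *sb,
 *					struct fs_context *fc)
 *	{
 *		sb->s_d_op = &simple_dentry_operations;
 *		...
 *	}
 *
 * With s_d_op set, simple_lookup() below skips its per-dentry
 * d_set_d_op() call.
 *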
*/ int always_delete_dentry(const struct dentry *dentry) { return 1; } EXPORT_SYMBOL(always_delete_dentry); const struct dentry_operations simple_dentry_operations = { .d_delete = always_delete_dentry, }; EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); d_add(dentry, NULL); return NULL; } EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } EXPORT_SYMBOL(dcache_dir_open); int dcache_dir_close(struct inode *inode, struct file *file) { dput(file->private_data); return 0; } EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ /* * Returns an element of siblings' list. * We are looking for <count>th positive after <p>; if * found, dentry is grabbed and returned to caller. * If no such element exists, NULL is returned. */ static struct dentry *scan_positives(struct dentry *cursor, struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); while (*p) { struct dentry *d = hlist_entry(*p, struct dentry, d_sib); p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(found)) break; count = 1; } if (need_resched()) { if (!hlist_unhashed(&cursor->d_sib)) __hlist_del(&cursor->d_sib); hlist_add_behind(&cursor->d_sib, &d->d_sib); p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); } } spin_unlock(&dentry->d_lock); dput(last); return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; inode_lock_shared(dentry->d_inode); if (offset > 2) to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (to) hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); file->f_pos = offset; inode_unlock_shared(dentry->d_inode); } return offset; } EXPORT_SYMBOL(dcache_dir_lseek); /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), * both impossible due to the lock on directory. 
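 */

/*
 * Editor's note: an illustrative wiring (hypothetical, not from the
 * original source) of the pieces above for a ramfs-style directory;
 * dcache_dir_open() attaches a cursor dentry that dcache_dir_lseek() and
 * dcache_readdir() move between children under the directory lock:
 *
 *	inode->i_op = &simple_dir_inode_operations;
 *	inode->i_fop = &simple_dir_operations;	// table defined just below
 *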
*/ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; struct dentry *next = NULL; struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) p = &dentry->d_children.first; else p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_sib.next; } spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (next) hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); return 0; } EXPORT_SYMBOL(dcache_readdir); ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; } EXPORT_SYMBOL(generic_read_dir); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; EXPORT_SYMBOL(simple_dir_inode_operations); static void offset_set(struct dentry *dentry, u32 offset) { dentry->d_fsdata = (void *)((uintptr_t)(offset)); } static u32 dentry2offset(struct dentry *dentry) { return (u32)((uintptr_t)(dentry->d_fsdata)); } static struct lock_class_key simple_offset_xa_lock; /** * simple_offset_init - initialize an offset_ctx * @octx: directory offset map to be initialized * */ void simple_offset_init(struct offset_ctx *octx) { xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); /* 0 is '.', 1 is '..', so always start with offset 2 */ octx->next_offset = 2; } /** * simple_offset_add - Add an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: new dentry being added * * Returns zero on success. @octx and the dentry offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); u32 offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, &octx->next_offset, GFP_KERNEL); if (ret < 0) return ret; offset_set(dentry, offset); return 0; } /** * simple_offset_remove - Remove an entry from a directory's offset map * @octx: directory offset ctx to be updated * @dentry: dentry being removed * */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { u32 offset; offset = dentry2offset(dentry); if (offset == 0) return; xa_erase(&octx->xa, offset); offset_set(dentry, 0); } /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved * @new_dir: destination parent * @new_dentry: destination dentry * * Returns zero on success. Otherwise a negative errno is returned and the * rename is rolled back. 
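 */

/*
 * Editor's note: a hedged sketch (hypothetical caller, not from the
 * original source) of the offset-map life cycle the helpers above
 * implement for tmpfs-style directories:
 *
 *	struct offset_ctx *octx = dir->i_op->get_offset_ctx(dir);
 *
 *	simple_offset_init(octx);		// at directory creation
 *	err = simple_offset_add(octx, dentry);	// on create; -EBUSY if mapped
 *	...
 *	simple_offset_remove(octx, dentry);	// on unlink
 *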
*/ int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); u32 old_index = dentry2offset(old_dentry); u32 new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); simple_offset_remove(new_ctx, new_dentry); ret = simple_offset_add(new_ctx, old_dentry); if (ret) goto out_restore; ret = simple_offset_add(old_ctx, new_dentry); if (ret) { simple_offset_remove(new_ctx, old_dentry); goto out_restore; } ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (ret) { simple_offset_remove(new_ctx, old_dentry); simple_offset_remove(old_ctx, new_dentry); goto out_restore; } return 0; out_restore: offset_set(old_dentry, old_index); xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); offset_set(new_dentry, new_index); xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); return ret; } /** * simple_offset_destroy - Release offset map * @octx: directory offset ctx that is about to be destroyed * * During fs teardown (eg. umount), a directory's offset map might still * contain entries. xa_destroy() cleans out anything that remains. */ void simple_offset_destroy(struct offset_ctx *octx) { xa_destroy(&octx->xa); } /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated * @offset: a byte offset * @whence: enumerator describing the starting position for this update * * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. * * Returns the updated read position if successful; otherwise a * negative errno is returned and the read position remains unchanged. */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset >= 0) break; fallthrough; default: return -EINVAL; } /* In this case, ->private_data is protected by f_pos_lock */ file->private_data = NULL; return vfs_setpos(file, offset, U32_MAX); } static struct dentry *offset_find_next(struct xa_state *xas) { struct dentry *child, *found = NULL; rcu_read_lock(); child = xas_next_entry(xas, U32_MAX); if (!child) goto out; spin_lock(&child->d_lock); if (simple_positive(child)) found = dget_dlock(child); spin_unlock(&child->d_lock); out: rcu_read_unlock(); return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { u32 offset = dentry2offset(dentry); struct inode *inode = d_inode(dentry); return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); XA_STATE(xas, &so_ctx->xa, ctx->pos); struct dentry *dentry; while (true) { dentry = offset_find_next(&xas); if (!dentry) return ERR_PTR(-ENOENT); if (!offset_dir_emit(ctx, dentry)) { dput(dentry); break; } dput(dentry); ctx->pos = xas.xa_index + 1; } return NULL; } /** * offset_readdir - Emit entries starting at offset @ctx->pos * @file: an open directory to iterate over * @ctx: directory iteration context * * Caller must hold @file's i_rwsem to prevent insertion or removal of * entries during this call. 
* * On entry, @ctx->pos contains an offset that represents the first entry * to be read from the directory. * * The operation continues until there are no more entries to read, or * until the ctx->actor indicates there is no more space in the caller's * output buffer. * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. * * Return values: * %0 - Complete */ static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; /* In this case, ->private_data is protected by f_pos_lock */ if (ctx->pos == 2) file->private_data = NULL; else if (file->private_data == ERR_PTR(-ENOENT)) return 0; file->private_data = offset_iterate_dir(d_inode(dir), ctx); return 0; } const struct file_operations simple_offset_dir_operations = { .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, .fsync = noop_fsync, }; static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); d = prev ? d_next_sibling(prev) : d_first_child(parent); hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) child = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(child)) break; } } spin_unlock(&parent->d_lock); dput(prev); return child; } void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; inode_lock(inode); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; this = this->d_parent; inode = this->d_inode; inode_lock(inode); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts if (d_is_dir(victim)) fsnotify_rmdir(inode, victim); else fsnotify_unlink(inode, victim); if (callback) callback(victim); dput(victim); // unpin it } if (victim == dentry) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); dput(dentry); return; } } inode_unlock(inode); this = child; } } EXPORT_SYMBOL(simple_recursive_removal); static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) return -ENOMEM; /* * since this is the first inode, make it number 1. New inodes created * after this must take care not to collide with it (by passing * max_reserved of 1 to iunique). 
*/ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; s->s_d_op = ctx->dops; return 0; } static int pseudo_fs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, pseudo_fs_fill_super); } static void pseudo_fs_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations pseudo_fs_context_ops = { .free = pseudo_fs_free, .get_tree = pseudo_fs_get_tree, }; /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, unsigned long magic) { struct pseudo_fs_context *ctx; ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); if (likely(ctx)) { ctx->magic = magic; fc->fs_private = ctx; fc->ops = &pseudo_fs_context_ops; fc->sb_flags |= SB_NOUSER; fc->global = true; } return ctx; } EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) file->private_data = inode->i_private; return 0; } EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; } spin_unlock(&child->d_lock); } ret = 1; out: spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); dput(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); int simple_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; } EXPORT_SYMBOL(simple_rmdir); /** * simple_rename_timestamp - update the various inode timestamps for rename * @old_dir: old parent directory * @old_dentry: dentry that is being renamed * @new_dir: new parent directory * @new_dentry: target for rename * * POSIX mandates that the old and new parent directories have their ctime and * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have * their ctime updated. 
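 */

/*
 * Editor's note: the helpers above (simple_lookup(), simple_link(),
 * simple_empty(), simple_unlink(), simple_rmdir()) and simple_rename()
 * below typically appear together in a ramfs-style directory
 * inode_operations table; a hedged sketch with hypothetical fs-specific
 * callbacks:
 *
 *	static const struct inode_operations examplefs_dir_iops = {
 *		.lookup	= simple_lookup,
 *		.link	= simple_link,
 *		.unlink	= simple_unlink,
 *		.rmdir	= simple_rmdir,
 *		.rename	= simple_rename,
 *		.create	= examplefs_create,	// fs-specific
 *		.mkdir	= examplefs_mkdir,	// fs-specific
 *	};
 *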
*/ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *newino = d_inode(new_dentry); inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (new_dir != old_dir) inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); inode_set_ctime_current(d_inode(old_dentry)); if (newino) inode_set_ctime_current(newino); } EXPORT_SYMBOL_GPL(simple_rename_timestamp); int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { bool old_is_dir = d_is_dir(old_dentry); bool new_is_dir = d_is_dir(new_dentry); if (old_dir != new_dir && old_is_dir != new_is_dir) { if (old_is_dir) { drop_nlink(old_dir); inc_nlink(new_dir); } else { drop_nlink(new_dir); inc_nlink(old_dir); } } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * * Returns 0 on success, -error on failure. * * simple_setattr is a simple ->setattr implementation without a proper * implementation of size changes. * * It can either be used for in-memory filesystems or special files * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. 
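 */

/*
 * Editor's note: a hedged illustration (hypothetical names) of the
 * pairing the comment above implies for an in-memory regular file:
 *
 *	static const struct inode_operations examplefs_file_iops = {
 *		.setattr = simple_setattr,	// truncate_setsize() only
 *		.getattr = simple_getattr,
 *	};
 *
 * A filesystem with on-disk or wire size state would instead supply its
 * own ->setattr() built around setattr_prepare()/setattr_copy().
 *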
*/ int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } EXPORT_SYMBOL(simple_setattr); static int simple_read_folio(struct file *file, struct folio *folio) { folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); folio_unlock(folio); return 0; } int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct folio *folio; folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *pagep = &folio->page; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + len, folio_size(folio)); } return 0; } EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " * @copied: " * @page: " * @fsdata: " * * simple_write_end does the minimum needed for updating a page after writing is * done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. * * Use *ONLY* with simple_read_folio() */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* zero the stale part of the folio if we did a short copy */ if (!folio_test_uptodate(folio)) { if (copied < len) { size_t from = offset_in_folio(folio, pos); folio_zero_range(folio, from + copied, len - copied); } folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } /* * Provides ramfs-style behavior: data in the pagecache, but no writeback. */ const struct address_space_operations ram_aops = { .read_folio = simple_read_folio, .write_begin = simple_write_begin, .write_end = simple_write_end, .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. 
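 */

/*
 * Editor's note: a hedged sketch of a simple_fill_super() call
 * (hypothetical filesystem, magic constant and file_operations):
 *
 *	static const struct tree_descr example_files[] = {
 *		// index 0 is unused; index 1 would collide with the root
 *		[2] = { "status", &example_status_fops, 0444 },
 *		{ "" }	// zero-length name terminates the array
 *	};
 *
 *	err = simple_fill_super(sb, EXAMPLEFS_MAGIC, example_files);
 *
 * Entries with a NULL name are skipped; each named entry becomes a file
 * whose inode number is its array index.
 *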
*/ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; struct dentry *dentry; int i; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; inode = new_inode(s); if (!inode) return -ENOMEM; /* * because the root inode is 1, the files array must not contain an * entry at index 1 */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; /* warn if it tries to conflict with the root inode */ if (unlikely(i == 1)) printk(KERN_WARNING "%s: %s passed in a files array with an index of 1!\n", __func__, s->s_type->name); dentry = d_alloc_name(s->s_root, files->name); if (!dentry) return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); } return 0; } EXPORT_SYMBOL(simple_fill_super); static DEFINE_SPINLOCK(pin_fs_lock); int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); if (!*mount) *mount = mnt; } mntget(*mount); ++*count; spin_unlock(&pin_fs_lock); mntput(mnt); return 0; } EXPORT_SYMBOL(simple_pin_fs); void simple_release_fs(struct vfsmount **mount, int *count) { struct vfsmount *mnt; spin_lock(&pin_fs_lock); mnt = *mount; if (!--*count) *mount = NULL; spin_unlock(&pin_fs_lock); mntput(mnt); } EXPORT_SYMBOL(simple_release_fs); /** * simple_read_from_buffer - copy data from the buffer to user space * @to: the user space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The simple_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the user space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or a negative value is returned on error. **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; size_t ret; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; ret = copy_to_user(to, from + pos, count); if (ret == count) return -EFAULT; count -= ret; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_read_from_buffer); /** * simple_write_to_buffer - copy data from user space to the buffer * @to: the buffer to write to * @available: the size of the buffer * @ppos: the current position in the buffer * @from: the user space buffer to read from * @count: the maximum number of bytes to read * * The simple_write_to_buffer() function reads up to @count bytes from the user * space address starting at @from into the buffer @to at offset @ppos. 
 * * On success, the number of bytes written is returned and the offset @ppos is * advanced by this number, or a negative value is returned on error. **/ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count) { loff_t pos = *ppos; size_t res; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; res = copy_from_user(to + pos, from, count); if (res == count) return -EFAULT; count -= res; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_write_to_buffer); /** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The memory_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the kernel space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or a negative value is returned on error. **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; if (pos < 0) return -EINVAL; if (pos >= available) return 0; if (count > available - pos) count = available - pos; memcpy(to, from + pos, count); *ppos = pos + count; return count; } EXPORT_SYMBOL(memory_read_from_buffer); /* * Transaction-based I/O. * The file expects a single write which triggers the transaction, and then * possibly a read which collects the result - which is stored in a * file-local buffer. */ void simple_transaction_set(struct file *file, size_t n) { struct simple_transaction_argresp *ar = file->private_data; BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); /* * The barrier ensures that ar->size will really remain zero until * ar->data is ready for reading. 
*/ smp_mb(); ar->size = n; } EXPORT_SYMBOL(simple_transaction_set); char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; static DEFINE_SPINLOCK(simple_transaction_lock); if (size > SIMPLE_TRANSACTION_LIMIT - 1) return ERR_PTR(-EFBIG); ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); if (!ar) return ERR_PTR(-ENOMEM); spin_lock(&simple_transaction_lock); /* only one write allowed per open */ if (file->private_data) { spin_unlock(&simple_transaction_lock); free_page((unsigned long)ar); return ERR_PTR(-EBUSY); } file->private_data = ar; spin_unlock(&simple_transaction_lock); if (copy_from_user(ar->data, buf, size)) return ERR_PTR(-EFAULT); return ar->data; } EXPORT_SYMBOL(simple_transaction_get); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { struct simple_transaction_argresp *ar = file->private_data; if (!ar) return 0; return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); } EXPORT_SYMBOL(simple_transaction_read); int simple_transaction_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); return 0; } EXPORT_SYMBOL(simple_transaction_release); /* Simple attribute files */ struct simple_attr { int (*get)(void *, u64 *); int (*set)(void *, u64); char get_buf[24]; /* enough to store a u64 and "\n\0" */ char set_buf[24]; void *data; const char *fmt; /* format for read operation */ struct mutex mutex; /* protects access to these buffers */ }; /* simple_attr_open is called by an actual attribute open file operation * to set the attribute specific access operations. */ int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { struct simple_attr *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; attr->get = get; attr->set = set; attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); file->private_data = attr; return nonseekable_open(inode, file); } EXPORT_SYMBOL_GPL(simple_attr_open); int simple_attr_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? 
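 */

/*
 * Editor's note: callers normally reach simple_attr_open() through the
 * DEFINE_SIMPLE_ATTRIBUTE() macro from <linux/fs.h>; a hedged sketch
 * with hypothetical get/set callbacks:
 *
 *	static u64 example_counter;
 *
 *	static int example_get(void *data, u64 *val)
 *	{
 *		*val = example_counter;
 *		return 0;
 *	}
 *
 *	static int example_set(void *data, u64 val)
 *	{
 *		example_counter = val;
 *		return 0;
 *	}
 *
 *	DEFINE_SIMPLE_ATTRIBUTE(example_fops, example_get, example_set,
 *				"%llu\n");
 *
 * Reads are then formatted with the "%llu\n" fmt by simple_attr_read()
 * below, and writes are parsed as numbers by simple_attr_write().
 *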
*/ /* read from the buffer that is filled with the get function */ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; size_t size; ssize_t ret; attr = file->private_data; if (!attr->get) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; if (*ppos && attr->get_buf[0]) { /* continued read */ size = strlen(attr->get_buf); } else { /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) goto out; size = scnprintf(attr->get_buf, sizeof(attr->get_buf), attr->fmt, (unsigned long long)val); } ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); out: mutex_unlock(&attr->mutex); return ret; } EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; size_t size; ssize_t ret; attr = file->private_data; if (!attr->set) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; ret = -EFAULT; size = min(sizeof(attr->set_buf) - 1, len); if (copy_from_user(attr->set_buf, buf, size)) goto out; attr->set_buf[size] = '\0'; if (is_signed) ret = kstrtoll(attr->set_buf, 0, &val); else ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; } ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(simple_attr_write); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** * generic_encode_ino32_fh - generic export_operations->encode_fh function * @inode: the object to encode * @fh: where to store the file handle fragment * @max_len: maximum length to store there (in 4 byte units) * @parent: parent directory inode, if wanted * * This generic encode_fh function assumes that the 32-bit inode number * is suitable for locating an inode, and that the generation number * can be used to check that it is still valid. It places them in the * filehandle fragment where export_decode_fh expects to find them. 
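 */

/*
 * Editor's note: a hedged sketch (hypothetical fs callbacks) of how the
 * export helpers below are typically wired up:
 *
 *	static const struct export_operations examplefs_export_ops = {
 *		.encode_fh	= generic_encode_ino32_fh,
 *		.fh_to_dentry	= examplefs_fh_to_dentry,
 *		.fh_to_parent	= examplefs_fh_to_parent,
 *	};
 *
 * where the two fh_to_* callbacks simply pass an inode-lookup helper such
 * as examplefs_iget() through to generic_fh_to_dentry() and
 * generic_fh_to_parent().
 *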
*/ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct fid *fid = (void *)fh; int len = *max_len; int type = FILEID_INO32_GEN; if (parent && (len < 4)) { *max_len = 4; return FILEID_INVALID; } else if (len < 2) { *max_len = 2; return FILEID_INVALID; } len = 2; fid->i32.ino = inode->i_ino; fid->i32.gen = inode->i_generation; if (parent) { fid->i32.parent_ino = parent->i_ino; fid->i32.parent_gen = parent->i_generation; len = 4; type = FILEID_INO32_GEN_PARENT; } *max_len = len; return type; } EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the object specified in the file handle. */ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.ino, fid->i32.gen); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the _parent_ object specified in the file handle if it * is specified in the file handle, or NULL otherwise. */ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len <= 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.parent_ino, (fh_len > 3 ? fid->i32.parent_gen : 0)); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** * __generic_file_fsync - generic fsync implementation for simple filesystems * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. 
*/ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: inode_unlock(inode); /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); /** * generic_file_fsync - generic fsync implementation for simple filesystems * with flush * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * */ int generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; err = __generic_file_fsync(file, start, end, datasync); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); /** * generic_check_addressable - Check addressability of file system * @blocksize_bits: log of file system block size * @num_blocks: number of blocks in file system * * Determine whether a file system with @num_blocks blocks (and a * block size of 2**@blocksize_bits) is addressable by the sector_t * and page cache of the system. Return 0 if so and -EFBIG otherwise. */ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits); if (unlikely(num_blocks == 0)) return 0; if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; } EXPORT_SYMBOL(generic_check_addressable); /* * No-op implementation of ->fsync for in-memory filesystems. */ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; } EXPORT_SYMBOL(noop_fsync); ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* * iomap based filesystems support direct I/O without need for * this callback. However, it still needs to be set in * inode->a_ops so that open/fcntl know that direct I/O is * generally supported. */ return -EINVAL; } EXPORT_SYMBOL_GPL(noop_direct_IO); /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { kfree(p); } EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &anon_aops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. 
*/ inode->i_state = I_DIRTY; inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); /** * simple_nosetlease - generic helper for prohibiting leases * @filp: file pointer * @arg: type of lease to obtain * @flp: new lease supplied for insertion * @priv: private data for lm_setup operation * * Generic helper for filesystems that do not wish to allow leases to be set. * All arguments are ignored and it just returns -EINVAL. */ int simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); /** * simple_get_link - generic helper to get the target of "fast" symlinks * @dentry: not used here * @inode: the symlink inode * @done: not used here * * Generic helper for filesystems to use for symlink inodes where a pointer to * the symlink target is stored in ->i_link. NOTE: this isn't normally called, * since as an optimization the path lookup code uses any non-NULL ->i_link * directly, without calling ->get_link(). But ->get_link() still must be set, * to mark the inode_operations as being for a symlink. * * Return: the symlink target */ const char *simple_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return inode->i_link; } EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, }; EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. */ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-ENOENT); } static int empty_dir_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; } static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; } static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, .permission = generic_permission, .setattr = empty_dir_setattr, .getattr = empty_dir_getattr, .listxattr = empty_dir_listxattr, }; static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) { /* An empty directory has two entries . and .. 
at offsets 0 and 1 */ return generic_file_llseek_size(file, offset, whence, 2, 2); } static int empty_dir_readdir(struct file *file, struct dir_context *ctx) { dir_emit_dots(file, ctx); return 0; } static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; void make_empty_dir_inode(struct inode *inode) { set_nlink(inode, 2); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } bool is_empty_dir_inode(struct inode *inode) { return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } #if IS_ENABLED(CONFIG_UNICODE) /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against * @len: len of name of dentry * @str: str pointer to name of dentry * @name: Name to compare against * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); const struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; struct qstr qstr = QSTR_INIT(str, len); char strbuf[DNAME_INLINE_LEN]; int ret; if (!dir || !IS_CASEFOLDED(dir)) goto fallback; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. 
*/ if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; qstr.name = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } ret = utf8_strncasecmp(um, name, &qstr); if (ret >= 0) return ret; if (sb_has_strict_encoding(sb)) return -EINVAL; fallback: if (len != name->len) return 1; return !!memcmp(str, name->name, len); } /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems * @dentry: dentry of the parent directory * @str: qstr of name whose hash we should fill in * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; int ret = 0; if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); if (ret < 0 && sb_has_strict_encoding(sb)) return -EINVAL; return 0; } static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif #ifdef CONFIG_FS_ENCRYPTION static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif #if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) static const struct dentry_operations generic_encrypted_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, .d_revalidate = fscrypt_d_revalidate, }; #endif /** * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry * @dentry: dentry to set ops on * * Casefolded directories need d_hash and d_compare set, so that the dentries * contained in them are handled case-insensitively. Note that these operations * are needed on the parent directory rather than on the dentries in it, and * while the casefolding flag can be toggled on and off on an empty directory, * dentry_operations can't be changed later. As a result, if the filesystem has * casefolding support enabled at all, we have to give all dentries the * casefolding operations even if their inode doesn't have the casefolding flag * currently (and thus the casefolding ops would be no-ops for now). * * Encryption works differently in that the only dentry operation it needs is * d_revalidate, which it only needs on dentries that have the no-key name flag. * The no-key flag can't be set "later", so we don't have to worry about that. * * Finally, to maximize compatibility with overlayfs (which isn't compatible * with certain dentry operations) and to avoid taking an unnecessary * performance hit, we use custom dentry_operations for each possible * combination rather than always installing all operations. 
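 */

/*
 * Editor's note: filesystems such as ext4 and f2fs call this helper early
 * in their ->lookup(), before the new dentry is visible to anyone else; a
 * hedged sketch with a hypothetical lookup:
 *
 *	static struct dentry *examplefs_lookup(struct inode *dir,
 *					       struct dentry *dentry,
 *					       unsigned int flags)
 *	{
 *		generic_set_encrypted_ci_d_ops(dentry);
 *		... fs-specific lookup ...
 *	}
 *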
*/ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) { #ifdef CONFIG_FS_ENCRYPTION bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; #endif #if IS_ENABLED(CONFIG_UNICODE) bool needs_ci_ops = dentry->d_sb->s_encoding; #endif #if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) if (needs_encrypt_ops && needs_ci_ops) { d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (needs_encrypt_ops) { d_set_d_op(dentry, &generic_encrypted_dentry_ops); return; } #endif #if IS_ENABLED(CONFIG_UNICODE) if (needs_ci_ops) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; } #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated * @force: increment the counter even if it's not necessary? * * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * * If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. * * In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. */ bool inode_maybe_inc_iversion(struct inode *inode, bool force) { u64 cur, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * Here, we add full memory barriers to ensure that any de-facto * ordering with other info is preserved. * * This barrier pairs with the barrier in inode_query_iversion() */ smp_mb(); cur = inode_peek_iversion_raw(inode); do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); /** * inode_query_iversion - read i_version for later use * @inode: inode from which i_version should be read * * Read the inode i_version counter. This should be used by callers that wish * to store the returned i_version for later comparison. This will guarantee * that a later query of the i_version will result in a different value if * anything has changed. * * In this implementation, we fetch the current value, set the QUERIED flag and * then try to swap it into place with a cmpxchg, if it wasn't already set. If * that fails, we try again with the newly fetched value from the cmpxchg. */ u64 inode_query_iversion(struct inode *inode) { u64 cur, new; cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { /* * This barrier (and the implicit barrier in the * cmpxchg below) pairs with the barrier in * inode_maybe_inc_iversion(). 
*/ smp_mb(); break; } new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos - buffered_written; loff_t end = iocb->ki_pos - 1; int err; /* * If the buffered write fallback returned an error, we want to return * the number of bytes which were written by direct I/O, or the error * code if that was zero. * * Note that this differs from normal direct-io semantics, which will * return -EFOO even if some bytes were written. */ if (unlikely(buffered_written < 0)) { if (direct_written) return direct_written; return buffered_written; } /* * We need to ensure that the page cache pages are written to disk and * invalidated to preserve the expected O_DIRECT semantics. */ err = filemap_write_and_wait_range(mapping, pos, end); if (err < 0) { /* * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; } invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); /** * simple_inode_init_ts - initialize the timestamps for a new inode * @inode: inode to be initialized * * When a new inode is created, most filesystems set the timestamps to the * current time. Add a helper to do this. */ struct timespec64 simple_inode_init_ts(struct inode *inode) { struct timespec64 ts = inode_set_ctime_current(inode); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); return ts; } EXPORT_SYMBOL(simple_inode_init_ts);
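/*
 * Usage sketch for simple_inode_init_ts() (illustrative only, not part of the
 * original file): a typical inode-creation path sets all three timestamps in
 * one call:
 *
 *	struct inode *inode = new_inode(sb);
 *
 *	if (!inode)
 *		return -ENOMEM;
 *	inode->i_ino = get_next_ino();
 *	inode->i_mode = mode;
 *	simple_inode_init_ts(inode);
 */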
/* * Copyright (c) 2006-2008 Intel Corporation * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> * * DRM core CRTC related functions * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. * * Authors: * Keith Packard * Eric Anholt <eric@anholt.net> * Dave Airlie <airlied@linux.ie> * Jesse Barnes <jesse.barnes@intel.com> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/dynamic_debug.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_atomic_uapi.h> #include <drm/drm_bridge.h> #include <drm/drm_crtc.h> #include <drm/drm_crtc_helper.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include "drm_crtc_helper_internal.h" DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0, "DRM_UT_CORE", "DRM_UT_DRIVER", "DRM_UT_KMS", "DRM_UT_PRIME", "DRM_UT_ATOMIC", "DRM_UT_VBL", "DRM_UT_STATE", "DRM_UT_LEASE", "DRM_UT_DP", "DRM_UT_DRMRES"); /** * DOC: overview * * The CRTC modeset helper library provides a default set_config implementation * in drm_crtc_helper_set_config(). Plus a few other convenience functions using * the same callbacks which drivers can use to e.g. restore the modeset * configuration on resume with drm_helper_resume_force_mode(). * * Note that this helper library doesn't track the current power state of CRTCs * and encoders. It can call callbacks like &drm_encoder_helper_funcs.dpms even * though the hardware is already in the desired state. This deficiency has been * fixed in the atomic helpers.
* * The driver callbacks are mostly compatible with the atomic modeset helpers, * except for the handling of the primary plane: Atomic helpers require that the * primary plane is implemented as a real standalone plane and not directly tied * to the CRTC state. For easier transition this library provides functions to * implement the old semantics required by the CRTC helpers using the new plane * and atomic helper callbacks. * * Drivers are strongly urged to convert to the atomic helpers (by way of first * converting to the plane helpers). New drivers must not use these functions * but need to implement the atomic interface instead, potentially using the * atomic helpers for that. * * These legacy modeset helpers use the same function table structures as * all other modesetting helpers. See the documentation for struct * &drm_crtc_helper_funcs, &struct drm_encoder_helper_funcs and struct * &drm_connector_helper_funcs. */ /** * drm_helper_encoder_in_use - check if a given encoder is in use * @encoder: encoder to check * * Checks whether @encoder is with the current mode setting output configuration * in use by any connector. This doesn't mean that it is actually enabled since * the DPMS state is tracked separately. * * Returns: * True if @encoder is used, false otherwise. */ bool drm_helper_encoder_in_use(struct drm_encoder *encoder) { struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_device *dev = encoder->dev; WARN_ON(drm_drv_uses_atomic_modeset(dev)); /* * We can expect this mutex to be locked if we are not panicking. * Locking is currently fubar in the panic handler. */ if (!oops_in_progress) { WARN_ON(!mutex_is_locked(&dev->mode_config.mutex)); WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex)); } drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder == encoder) { drm_connector_list_iter_end(&conn_iter); return true; } } drm_connector_list_iter_end(&conn_iter); return false; } EXPORT_SYMBOL(drm_helper_encoder_in_use); /** * drm_helper_crtc_in_use - check if a given CRTC is in a mode_config * @crtc: CRTC to check * * Checks whether @crtc is with the current mode setting output configuration * in use by any connector. This doesn't mean that it is actually enabled since * the DPMS state is tracked separately. * * Returns: * True if @crtc is used, false otherwise. */ bool drm_helper_crtc_in_use(struct drm_crtc *crtc) { struct drm_encoder *encoder; struct drm_device *dev = crtc->dev; WARN_ON(drm_drv_uses_atomic_modeset(dev)); /* * We can expect this mutex to be locked if we are not panicking. * Locking is currently fubar in the panic handler. 
*/ if (!oops_in_progress) WARN_ON(!mutex_is_locked(&dev->mode_config.mutex)); drm_for_each_encoder(encoder, dev) if (encoder->crtc == crtc && drm_helper_encoder_in_use(encoder)) return true; return false; } EXPORT_SYMBOL(drm_helper_crtc_in_use); static void drm_encoder_disable(struct drm_encoder *encoder) { const struct drm_encoder_helper_funcs *encoder_funcs = encoder->helper_private; if (!encoder_funcs) return; if (encoder_funcs->disable) (*encoder_funcs->disable)(encoder); else if (encoder_funcs->dpms) (*encoder_funcs->dpms)(encoder, DRM_MODE_DPMS_OFF); } static void __drm_helper_disable_unused_functions(struct drm_device *dev) { struct drm_encoder *encoder; struct drm_crtc *crtc; drm_warn_on_modeset_not_all_locked(dev); drm_for_each_encoder(encoder, dev) { if (!drm_helper_encoder_in_use(encoder)) { drm_encoder_disable(encoder); /* disconnect encoder from any connector */ encoder->crtc = NULL; } } drm_for_each_crtc(crtc, dev) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; crtc->enabled = drm_helper_crtc_in_use(crtc); if (!crtc->enabled) { if (crtc_funcs->disable) (*crtc_funcs->disable)(crtc); else (*crtc_funcs->dpms)(crtc, DRM_MODE_DPMS_OFF); crtc->primary->fb = NULL; } } } /** * drm_helper_disable_unused_functions - disable unused objects * @dev: DRM device * * This function walks through the entire mode setting configuration of @dev. It * will remove any CRTC links of unused encoders and encoder links of * disconnected connectors. Then it will disable all unused encoders and CRTCs * either by calling their disable callback if available or by calling their * dpms callback with DRM_MODE_DPMS_OFF. * * NOTE: * * This function is part of the legacy modeset helper library and will cause * major confusion with atomic drivers. This is because atomic helpers guarantee * to never call ->disable() hooks on a disabled function, or ->enable() hooks * on an enabled function. drm_helper_disable_unused_functions() on the other * hand throws such guarantees into the wind and calls disable hooks * unconditionally on unused functions. */ void drm_helper_disable_unused_functions(struct drm_device *dev) { WARN_ON(drm_drv_uses_atomic_modeset(dev)); drm_modeset_lock_all(dev); __drm_helper_disable_unused_functions(dev); drm_modeset_unlock_all(dev); } EXPORT_SYMBOL(drm_helper_disable_unused_functions); /* * Check the CRTC we're going to map each output to vs. its current * CRTC. If they don't match, we have to disable the output and the CRTC * since the driver will have to re-route things. */ static void drm_crtc_prepare_encoders(struct drm_device *dev) { const struct drm_encoder_helper_funcs *encoder_funcs; struct drm_encoder *encoder; drm_for_each_encoder(encoder, dev) { encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; /* Disable unused encoders */ if (encoder->crtc == NULL) drm_encoder_disable(encoder); } } /** * drm_crtc_helper_set_mode - internal helper to set a mode * @crtc: CRTC to program * @mode: mode to use * @x: horizontal offset into the surface * @y: vertical offset into the surface * @old_fb: old framebuffer, for cleanup * * Try to set @mode on @crtc. Give @crtc and its associated connectors a chance * to fixup or reject the mode prior to trying to set it. This is an internal * helper that drivers could e.g. use to update properties that require the * entire output pipe to be disabled and re-enabled in a new configuration. For * example for changing whether audio is enabled on a hdmi link or for changing * panel fitter or dither attributes.
It is also called by the * drm_crtc_helper_set_config() helper function to drive the mode setting * sequence. * * Returns: * True if the mode was set successfully, false otherwise. */ bool drm_crtc_helper_set_mode(struct drm_crtc *crtc, struct drm_display_mode *mode, int x, int y, struct drm_framebuffer *old_fb) { struct drm_device *dev = crtc->dev; struct drm_display_mode *adjusted_mode, saved_mode, saved_hwmode; const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; const struct drm_encoder_helper_funcs *encoder_funcs; int saved_x, saved_y; bool saved_enabled; struct drm_encoder *encoder; bool ret = true; WARN_ON(drm_drv_uses_atomic_modeset(dev)); drm_warn_on_modeset_not_all_locked(dev); saved_enabled = crtc->enabled; crtc->enabled = drm_helper_crtc_in_use(crtc); if (!crtc->enabled) return true; adjusted_mode = drm_mode_duplicate(dev, mode); if (!adjusted_mode) { crtc->enabled = saved_enabled; return false; } drm_mode_init(&saved_mode, &crtc->mode); drm_mode_init(&saved_hwmode, &crtc->hwmode); saved_x = crtc->x; saved_y = crtc->y; /* Update crtc values up front so the driver can rely on them for mode * setting. */ drm_mode_copy(&crtc->mode, mode); crtc->x = x; crtc->y = y; /* Pass our mode to the connectors and the CRTC to give them a chance to * adjust it according to limitations or connector properties, and also * a chance to reject the mode entirely. */ drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; if (encoder_funcs->mode_fixup) { if (!(ret = encoder_funcs->mode_fixup(encoder, mode, adjusted_mode))) { DRM_DEBUG_KMS("Encoder fixup failed\n"); goto done; } } } if (crtc_funcs->mode_fixup) { if (!(ret = crtc_funcs->mode_fixup(crtc, mode, adjusted_mode))) { DRM_DEBUG_KMS("CRTC fixup failed\n"); goto done; } } DRM_DEBUG_KMS("[CRTC:%d:%s]\n", crtc->base.id, crtc->name); drm_mode_copy(&crtc->hwmode, adjusted_mode); /* Prepare the encoders and CRTCs before setting the mode. */ drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; /* Disable the encoders as the first thing we do. */ if (encoder_funcs->prepare) encoder_funcs->prepare(encoder); } drm_crtc_prepare_encoders(dev); crtc_funcs->prepare(crtc); /* Set up the DPLL and any encoders state that needs to adjust or depend * on the DPLL. */ ret = !crtc_funcs->mode_set(crtc, mode, adjusted_mode, x, y, old_fb); if (!ret) goto done; drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; DRM_DEBUG_KMS("[ENCODER:%d:%s] set [MODE:%s]\n", encoder->base.id, encoder->name, mode->name); if (encoder_funcs->mode_set) encoder_funcs->mode_set(encoder, mode, adjusted_mode); } /* Now enable the clocks, plane, pipe, and connectors that we set up. */ crtc_funcs->commit(crtc); drm_for_each_encoder(encoder, dev) { if (encoder->crtc != crtc) continue; encoder_funcs = encoder->helper_private; if (!encoder_funcs) continue; if (encoder_funcs->commit) encoder_funcs->commit(encoder); } /* Calculate and store various constants which * are later needed by vblank and swap-completion * timestamping. They are derived from true hwmode.
*/ drm_calc_timestamping_constants(crtc, &crtc->hwmode); /* FIXME: add subpixel order */ done: drm_mode_destroy(dev, adjusted_mode); if (!ret) { crtc->enabled = saved_enabled; drm_mode_copy(&crtc->mode, &saved_mode); drm_mode_copy(&crtc->hwmode, &saved_hwmode); crtc->x = saved_x; crtc->y = saved_y; } return ret; } EXPORT_SYMBOL(drm_crtc_helper_set_mode); /** * drm_crtc_helper_atomic_check() - Helper to check CRTC atomic-state * @crtc: CRTC to check * @state: atomic state object * * Provides a default CRTC-state check handler for CRTCs that only have * one primary plane attached to it. This is often the case for the CRTC * of simple framebuffers. * * RETURNS: * Zero on success, or an errno code otherwise. */ int drm_crtc_helper_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state) { struct drm_crtc_state *new_crtc_state = drm_atomic_get_new_crtc_state(state, crtc); if (!new_crtc_state->enable) return 0; return drm_atomic_helper_check_crtc_primary_plane(new_crtc_state); } EXPORT_SYMBOL(drm_crtc_helper_atomic_check); static void drm_crtc_helper_disable(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; struct drm_connector *connector; struct drm_encoder *encoder; /* Decouple all encoders and their attached connectors from this crtc */ drm_for_each_encoder(encoder, dev) { struct drm_connector_list_iter conn_iter; if (encoder->crtc != crtc) continue; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (connector->encoder != encoder) continue; connector->encoder = NULL; /* * drm_helper_disable_unused_functions() ought to be * doing this, but since we've decoupled the encoder * from the connector above, the required connection * between them is henceforth no longer available. */ connector->dpms = DRM_MODE_DPMS_OFF; /* we keep a reference while the encoder is bound */ drm_connector_put(connector); } drm_connector_list_iter_end(&conn_iter); } __drm_helper_disable_unused_functions(dev); } /* * For connectors that support multiple encoders, either the * .atomic_best_encoder() or .best_encoder() operation must be implemented. */ struct drm_encoder * drm_connector_get_single_encoder(struct drm_connector *connector) { struct drm_encoder *encoder; WARN_ON(hweight32(connector->possible_encoders) > 1); drm_connector_for_each_possible_encoder(connector, encoder) return encoder; return NULL; } /** * drm_crtc_helper_set_config - set a new config from userspace * @set: mode set configuration * @ctx: lock acquire context, not used here * * The drm_crtc_helper_set_config() helper function implements the * &drm_crtc_funcs.set_config callback for drivers using the legacy CRTC * helpers. * * It first tries to locate the best encoder for each connector by calling the * connector &drm_connector_helper_funcs.best_encoder helper operation. * * After locating the appropriate encoders, the helper function will call the * mode_fixup encoder and CRTC helper operations to adjust the requested mode, * or reject it completely in which case an error will be returned to the * application. If the new configuration after mode adjustment is identical to * the current configuration the helper function will return without performing * any other operation. * * If the adjusted mode is identical to the current mode but changes to the * frame buffer need to be applied, the drm_crtc_helper_set_config() function * will call the CRTC &drm_crtc_helper_funcs.mode_set_base helper operation.
* * If the adjusted mode differs from the current mode, or if the * ->mode_set_base() helper operation is not provided, the helper function * performs a full mode set sequence by calling the ->prepare(), ->mode_set() * and ->commit() CRTC and encoder helper operations, in that order. * Alternatively it can also use the dpms and disable helper operations. For * details see &struct drm_crtc_helper_funcs and struct * &drm_encoder_helper_funcs. * * This function is deprecated. New drivers must implement atomic modeset * support, for which this function is unsuitable. Instead drivers should use * drm_atomic_helper_set_config(). * * Returns: * Returns 0 on success, negative errno numbers on failure. */ int drm_crtc_helper_set_config(struct drm_mode_set *set, struct drm_modeset_acquire_ctx *ctx) { struct drm_device *dev; struct drm_crtc **save_encoder_crtcs, *new_crtc; struct drm_encoder **save_connector_encoders, *new_encoder, *encoder; bool mode_changed = false; /* if true do a full mode set */ bool fb_changed = false; /* if true and !mode_changed just do a flip */ struct drm_connector *connector; struct drm_connector_list_iter conn_iter; int count = 0, ro, fail = 0; const struct drm_crtc_helper_funcs *crtc_funcs; struct drm_mode_set save_set; int ret; int i; DRM_DEBUG_KMS("\n"); BUG_ON(!set); BUG_ON(!set->crtc); BUG_ON(!set->crtc->helper_private); /* Enforce sane interface api - has been abused by the fb helper. */ BUG_ON(!set->mode && set->fb); BUG_ON(set->fb && set->num_connectors == 0); crtc_funcs = set->crtc->helper_private; dev = set->crtc->dev; WARN_ON(drm_drv_uses_atomic_modeset(dev)); if (!set->mode) set->fb = NULL; if (set->fb) { DRM_DEBUG_KMS("[CRTC:%d:%s] [FB:%d] #connectors=%d (x y) (%i %i)\n", set->crtc->base.id, set->crtc->name, set->fb->base.id, (int)set->num_connectors, set->x, set->y); } else { DRM_DEBUG_KMS("[CRTC:%d:%s] [NOFB]\n", set->crtc->base.id, set->crtc->name); drm_crtc_helper_disable(set->crtc); return 0; } drm_warn_on_modeset_not_all_locked(dev); /* * Allocate space for the backup of all (non-pointer) encoder and * connector data. */ save_encoder_crtcs = kcalloc(dev->mode_config.num_encoder, sizeof(struct drm_crtc *), GFP_KERNEL); if (!save_encoder_crtcs) return -ENOMEM; save_connector_encoders = kcalloc(dev->mode_config.num_connector, sizeof(struct drm_encoder *), GFP_KERNEL); if (!save_connector_encoders) { kfree(save_encoder_crtcs); return -ENOMEM; } /* * Copy data. Note that driver private data is not affected. * Should anything bad happen only the expected state is * restored, not the drivers personal bookkeeping. 
*/ count = 0; drm_for_each_encoder(encoder, dev) { save_encoder_crtcs[count++] = encoder->crtc; } count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) save_connector_encoders[count++] = connector->encoder; drm_connector_list_iter_end(&conn_iter); save_set.crtc = set->crtc; save_set.mode = &set->crtc->mode; save_set.x = set->crtc->x; save_set.y = set->crtc->y; save_set.fb = set->crtc->primary->fb; /* We should be able to check here if the fb has the same properties * and then just flip_or_move it */ if (set->crtc->primary->fb != set->fb) { /* If we have no fb then treat it as a full mode set */ if (set->crtc->primary->fb == NULL) { DRM_DEBUG_KMS("crtc has no fb, full mode set\n"); mode_changed = true; } else if (set->fb->format != set->crtc->primary->fb->format) { mode_changed = true; } else fb_changed = true; } if (set->x != set->crtc->x || set->y != set->crtc->y) fb_changed = true; if (!drm_mode_equal(set->mode, &set->crtc->mode)) { DRM_DEBUG_KMS("modes are different, full mode set\n"); drm_mode_debug_printmodeline(&set->crtc->mode); drm_mode_debug_printmodeline(set->mode); mode_changed = true; } /* take a reference on all unbound connectors in set, reuse the * already taken reference for bound connectors */ for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro]->encoder) continue; drm_connector_get(set->connectors[ro]); } /* a) traverse passed in connector list and get encoders for them */ count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { const struct drm_connector_helper_funcs *connector_funcs = connector->helper_private; new_encoder = connector->encoder; for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro] == connector) { if (connector_funcs->best_encoder) new_encoder = connector_funcs->best_encoder(connector); else new_encoder = drm_connector_get_single_encoder(connector); /* if we can't get an encoder for a connector we are setting now - then fail */ if (new_encoder == NULL) /* don't break so fail path works correct */ fail = 1; if (connector->dpms != DRM_MODE_DPMS_ON) { DRM_DEBUG_KMS("connector dpms not on, full mode switch\n"); mode_changed = true; } break; } } if (new_encoder != connector->encoder) { DRM_DEBUG_KMS("encoder changed, full mode switch\n"); mode_changed = true; /* If the encoder is reused for another connector, then * the appropriate crtc will be set later. 
*/ if (connector->encoder) connector->encoder->crtc = NULL; connector->encoder = new_encoder; } } drm_connector_list_iter_end(&conn_iter); if (fail) { ret = -EINVAL; goto fail; } count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (!connector->encoder) continue; if (connector->encoder->crtc == set->crtc) new_crtc = NULL; else new_crtc = connector->encoder->crtc; for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro] == connector) new_crtc = set->crtc; } /* Make sure the new CRTC will work with the encoder */ if (new_crtc && !drm_encoder_crtc_ok(connector->encoder, new_crtc)) { ret = -EINVAL; drm_connector_list_iter_end(&conn_iter); goto fail; } if (new_crtc != connector->encoder->crtc) { DRM_DEBUG_KMS("crtc changed, full mode switch\n"); mode_changed = true; connector->encoder->crtc = new_crtc; } if (new_crtc) { DRM_DEBUG_KMS("[CONNECTOR:%d:%s] to [CRTC:%d:%s]\n", connector->base.id, connector->name, new_crtc->base.id, new_crtc->name); } else { DRM_DEBUG_KMS("[CONNECTOR:%d:%s] to [NOCRTC]\n", connector->base.id, connector->name); } } drm_connector_list_iter_end(&conn_iter); /* mode_set_base is not a required function */ if (fb_changed && !crtc_funcs->mode_set_base) mode_changed = true; if (mode_changed) { if (drm_helper_crtc_in_use(set->crtc)) { DRM_DEBUG_KMS("attempting to set mode from" " userspace\n"); drm_mode_debug_printmodeline(set->mode); set->crtc->primary->fb = set->fb; if (!drm_crtc_helper_set_mode(set->crtc, set->mode, set->x, set->y, save_set.fb)) { DRM_ERROR("failed to set mode on [CRTC:%d:%s]\n", set->crtc->base.id, set->crtc->name); set->crtc->primary->fb = save_set.fb; ret = -EINVAL; goto fail; } DRM_DEBUG_KMS("Setting connector DPMS state to on\n"); for (i = 0; i < set->num_connectors; i++) { DRM_DEBUG_KMS("\t[CONNECTOR:%d:%s] set DPMS on\n", set->connectors[i]->base.id, set->connectors[i]->name); set->connectors[i]->funcs->dpms(set->connectors[i], DRM_MODE_DPMS_ON); } } __drm_helper_disable_unused_functions(dev); } else if (fb_changed) { set->crtc->x = set->x; set->crtc->y = set->y; set->crtc->primary->fb = set->fb; ret = crtc_funcs->mode_set_base(set->crtc, set->x, set->y, save_set.fb); if (ret != 0) { set->crtc->x = save_set.x; set->crtc->y = save_set.y; set->crtc->primary->fb = save_set.fb; goto fail; } } kfree(save_connector_encoders); kfree(save_encoder_crtcs); return 0; fail: /* Restore all previous data. 
*/ count = 0; drm_for_each_encoder(encoder, dev) { encoder->crtc = save_encoder_crtcs[count++]; } count = 0; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) connector->encoder = save_connector_encoders[count++]; drm_connector_list_iter_end(&conn_iter); /* after fail drop reference on all unbound connectors in set, let * bound connectors keep their reference */ for (ro = 0; ro < set->num_connectors; ro++) { if (set->connectors[ro]->encoder) continue; drm_connector_put(set->connectors[ro]); } /* Try to restore the config */ if (mode_changed && !drm_crtc_helper_set_mode(save_set.crtc, save_set.mode, save_set.x, save_set.y, save_set.fb)) DRM_ERROR("failed to restore config after modeset failure\n"); kfree(save_connector_encoders); kfree(save_encoder_crtcs); return ret; } EXPORT_SYMBOL(drm_crtc_helper_set_config); static int drm_helper_choose_encoder_dpms(struct drm_encoder *encoder) { int dpms = DRM_MODE_DPMS_OFF; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_device *dev = encoder->dev; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) if (connector->encoder == encoder) if (connector->dpms < dpms) dpms = connector->dpms; drm_connector_list_iter_end(&conn_iter); return dpms; } /* Helper which handles bridge ordering around encoder dpms */ static void drm_helper_encoder_dpms(struct drm_encoder *encoder, int mode) { const struct drm_encoder_helper_funcs *encoder_funcs; encoder_funcs = encoder->helper_private; if (!encoder_funcs) return; if (encoder_funcs->dpms) encoder_funcs->dpms(encoder, mode); } static int drm_helper_choose_crtc_dpms(struct drm_crtc *crtc) { int dpms = DRM_MODE_DPMS_OFF; struct drm_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_device *dev = crtc->dev; drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) if (connector->encoder && connector->encoder->crtc == crtc) if (connector->dpms < dpms) dpms = connector->dpms; drm_connector_list_iter_end(&conn_iter); return dpms; } /** * drm_helper_connector_dpms() - connector dpms helper implementation * @connector: affected connector * @mode: DPMS mode * * The drm_helper_connector_dpms() helper function implements the * &drm_connector_funcs.dpms callback for drivers using the legacy CRTC * helpers. * * This is the main helper function provided by the CRTC helper framework for * implementing the DPMS connector attribute. It computes the new desired DPMS * state for all encoders and CRTCs in the output mesh and calls the * &drm_crtc_helper_funcs.dpms and &drm_encoder_helper_funcs.dpms callbacks * provided by the driver. * * This function is deprecated. New drivers must implement atomic modeset * support, where DPMS is handled in the DRM core. * * Returns: * Always returns 0. */ int drm_helper_connector_dpms(struct drm_connector *connector, int mode) { struct drm_encoder *encoder = connector->encoder; struct drm_crtc *crtc = encoder ? 
encoder->crtc : NULL; int old_dpms, encoder_dpms = DRM_MODE_DPMS_OFF; WARN_ON(drm_drv_uses_atomic_modeset(connector->dev)); if (mode == connector->dpms) return 0; old_dpms = connector->dpms; connector->dpms = mode; if (encoder) encoder_dpms = drm_helper_choose_encoder_dpms(encoder); /* from off to on, do crtc then encoder */ if (mode < old_dpms) { if (crtc) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; if (crtc_funcs->dpms) (*crtc_funcs->dpms) (crtc, drm_helper_choose_crtc_dpms(crtc)); } if (encoder) drm_helper_encoder_dpms(encoder, encoder_dpms); } /* from on to off, do encoder then crtc */ if (mode > old_dpms) { if (encoder) drm_helper_encoder_dpms(encoder, encoder_dpms); if (crtc) { const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; if (crtc_funcs->dpms) (*crtc_funcs->dpms) (crtc, drm_helper_choose_crtc_dpms(crtc)); } } return 0; } EXPORT_SYMBOL(drm_helper_connector_dpms); /** * drm_helper_resume_force_mode - force-restore mode setting configuration * @dev: drm_device which should be restored * * Drivers which use the mode setting helpers can use this function to * force-restore the mode setting configuration e.g. on resume or when something * else might have trampled over the hw state (like some overzealous old BIOSen * tended to do). * * This helper doesn't provide an error return value since restoring the old * config should never fail due to resource allocation issues since the driver * has successfully set the restored configuration already. Hence this should * boil down to the equivalent of a few dpms on calls, which also don't provide * an error code. * * Drivers where simply restoring an old configuration again might fail (e.g. * due to slight differences in allocating shared resources when the * configuration is restored in a different order than when userspace set it up) * need to use their own restore logic. * * This function is deprecated. New drivers should implement atomic mode- * setting and use the atomic suspend/resume helpers. * * See also: * drm_atomic_helper_suspend(), drm_atomic_helper_resume() */ void drm_helper_resume_force_mode(struct drm_device *dev) { struct drm_crtc *crtc; struct drm_encoder *encoder; const struct drm_crtc_helper_funcs *crtc_funcs; int encoder_dpms; bool ret; WARN_ON(drm_drv_uses_atomic_modeset(dev)); drm_modeset_lock_all(dev); drm_for_each_crtc(crtc, dev) { if (!crtc->enabled) continue; ret = drm_crtc_helper_set_mode(crtc, &crtc->mode, crtc->x, crtc->y, crtc->primary->fb); /* Restoring the old config should never fail! */ if (ret == false) DRM_ERROR("failed to set mode on crtc %p\n", crtc); /* Turn off outputs that were already powered off */ if (drm_helper_choose_crtc_dpms(crtc)) { drm_for_each_encoder(encoder, dev) { if(encoder->crtc != crtc) continue; encoder_dpms = drm_helper_choose_encoder_dpms( encoder); drm_helper_encoder_dpms(encoder, encoder_dpms); } crtc_funcs = crtc->helper_private; if (crtc_funcs->dpms) (*crtc_funcs->dpms) (crtc, drm_helper_choose_crtc_dpms(crtc)); } } /* disable the unused connectors while restoring the modesetting */ __drm_helper_disable_unused_functions(dev); drm_modeset_unlock_all(dev); } EXPORT_SYMBOL(drm_helper_resume_force_mode); /** * drm_helper_force_disable_all - Forcibly turn off all enabled CRTCs * @dev: DRM device whose CRTCs to turn off * * Drivers may want to call this on unload to ensure that all displays are * unlit and the GPU is in a consistent, low power state. Takes modeset locks. * * Note: This should only be used by non-atomic legacy drivers.
For an atomic * version look at drm_atomic_helper_shutdown(). * * Returns: * Zero on success, error code on failure. */ int drm_helper_force_disable_all(struct drm_device *dev) { struct drm_crtc *crtc; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_crtc(crtc, dev) if (crtc->enabled) { struct drm_mode_set set = { .crtc = crtc, }; ret = drm_mode_set_config_internal(&set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } EXPORT_SYMBOL(drm_helper_force_disable_all);
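/*
 * Illustrative sketch (the "foo" names are hypothetical, not from this file):
 * a legacy, non-atomic KMS driver typically wires these helpers up through
 * its function tables, roughly like so:
 *
 *	static const struct drm_crtc_funcs foo_crtc_funcs = {
 *		.set_config = drm_crtc_helper_set_config,
 *		.destroy = drm_crtc_cleanup,
 *	};
 *
 *	static const struct drm_connector_funcs foo_connector_funcs = {
 *		.dpms = drm_helper_connector_dpms,
 *		.fill_modes = drm_helper_probe_single_connector_modes,
 *		.destroy = drm_connector_cleanup,
 *	};
 */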
// SPDX-License-Identifier: GPL-2.0-only /* * Link physical devices with ACPI devices support * * Copyright (c) 2005 David Shaohua Li <shaohua.li@intel.com> * Copyright (c) 2005 Intel Corp.
*/ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/acpi_iort.h> #include <linux/export.h> #include <linux/init.h> #include <linux/list.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/acpi.h> #include <linux/dma-mapping.h> #include <linux/pci.h> #include <linux/pci-acpi.h> #include <linux/platform_device.h> #include "internal.h" static LIST_HEAD(bus_type_list); static DECLARE_RWSEM(bus_type_sem); #define PHYSICAL_NODE_STRING "physical_node" #define PHYSICAL_NODE_NAME_SIZE (sizeof(PHYSICAL_NODE_STRING) + 10) int register_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return -ENODEV; if (type && type->match && type->find_companion) { down_write(&bus_type_sem); list_add_tail(&type->list, &bus_type_list); up_write(&bus_type_sem); pr_info("bus type %s registered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(register_acpi_bus_type); int unregister_acpi_bus_type(struct acpi_bus_type *type) { if (acpi_disabled) return 0; if (type) { down_write(&bus_type_sem); list_del_init(&type->list); up_write(&bus_type_sem); pr_info("bus type %s unregistered\n", type->name); return 0; } return -ENODEV; } EXPORT_SYMBOL_GPL(unregister_acpi_bus_type); static struct acpi_bus_type *acpi_get_bus_type(struct device *dev) { struct acpi_bus_type *tmp, *ret = NULL; down_read(&bus_type_sem); list_for_each_entry(tmp, &bus_type_list, list) { if (tmp->match(dev)) { ret = tmp; break; } } up_read(&bus_type_sem); return ret; } #define FIND_CHILD_MIN_SCORE 1 #define FIND_CHILD_MID_SCORE 2 #define FIND_CHILD_MAX_SCORE 3 static int match_any(struct acpi_device *adev, void *not_used) { return 1; } static bool acpi_dev_has_children(struct acpi_device *adev) { return acpi_dev_for_each_child(adev, match_any, NULL) > 0; } static int find_child_checks(struct acpi_device *adev, bool check_children) { unsigned long long sta; acpi_status status; if (check_children && !acpi_dev_has_children(adev)) return -ENODEV; status = acpi_evaluate_integer(adev->handle, "_STA", NULL, &sta); if (status == AE_NOT_FOUND) { /* * Special case: backlight device objects without _STA are * preferred to other objects with the same _ADR value, because * it is more likely that they are actually useful. */ if (adev->pnp.type.backlight) return FIND_CHILD_MID_SCORE; return FIND_CHILD_MIN_SCORE; } if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_ENABLED)) return -ENODEV; /* * If the device has a _HID returning a valid ACPI/PNP device ID, it is * better to make it look less attractive here, so that the other device * with the same _ADR value (that may not have a valid device ID) can be * matched going forward. [This means a second spec violation in a row, * so whatever we do here is best effort anyway.] */ if (adev->pnp.type.platform_id) return FIND_CHILD_MIN_SCORE; return FIND_CHILD_MAX_SCORE; } struct find_child_walk_data { struct acpi_device *adev; u64 address; int score; bool check_sta; bool check_children; }; static int check_one_child(struct acpi_device *adev, void *data) { struct find_child_walk_data *wd = data; int score; if (!adev->pnp.type.bus_address || acpi_device_adr(adev) != wd->address) return 0; if (!wd->adev) { /* * This is the first matching object, so save it. If it is not * necessary to look for any other matching objects, stop the * search. */ wd->adev = adev; return !(wd->check_sta || wd->check_children); } /* * There is more than one matching device object with the same _ADR * value. That really is unexpected, so we are kind of beyond the scope * of the spec here. 
We have to choose which one to return, though. * * First, get the score for the previously found object and terminate * the walk if it is maximum. */ if (!wd->score) { score = find_child_checks(wd->adev, wd->check_children); if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* * Second, if the object that has just been found has a better score, * replace the previously found one with it and terminate the walk if * the new score is maximum. */ score = find_child_checks(adev, wd->check_children); if (score > wd->score) { wd->adev = adev; if (score == FIND_CHILD_MAX_SCORE) return 1; wd->score = score; } /* Continue, because there may be better matches. */ return 0; } static struct acpi_device *acpi_find_child(struct acpi_device *parent, u64 address, bool check_children, bool check_sta) { struct find_child_walk_data wd = { .address = address, .check_children = check_children, .check_sta = check_sta, .adev = NULL, .score = 0, }; if (parent) acpi_dev_for_each_child(parent, check_one_child, &wd); return wd.adev; } struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children) { return acpi_find_child(parent, address, check_children, true); } EXPORT_SYMBOL_GPL(acpi_find_child_device); struct acpi_device *acpi_find_child_by_adr(struct acpi_device *adev, acpi_bus_address adr) { return acpi_find_child(adev, adr, false, false); } EXPORT_SYMBOL_GPL(acpi_find_child_by_adr); static void acpi_physnode_link_name(char *buf, unsigned int node_id) { if (node_id > 0) snprintf(buf, PHYSICAL_NODE_NAME_SIZE, PHYSICAL_NODE_STRING "%u", node_id); else strcpy(buf, PHYSICAL_NODE_STRING); } int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) { struct acpi_device_physical_node *physical_node, *pn; char physical_node_name[PHYSICAL_NODE_NAME_SIZE]; struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; if (has_acpi_companion(dev)) { if (acpi_dev) { dev_warn(dev, "ACPI companion already set\n"); return -EINVAL; } else { acpi_dev = ACPI_COMPANION(dev); } } if (!acpi_dev) return -EINVAL; acpi_dev_get(acpi_dev); get_device(dev); physical_node = kzalloc(sizeof(*physical_node), GFP_KERNEL); if (!physical_node) { retval = -ENOMEM; goto err; } mutex_lock(&acpi_dev->physical_node_lock); /* * Keep the list sorted by node_id so that the IDs of removed nodes can * be recycled easily. */ physnode_list = &acpi_dev->physical_node_list; node_id = 0; list_for_each_entry(pn, &acpi_dev->physical_node_list, node) { /* Sanity check. 
*/ if (pn->dev == dev) { mutex_unlock(&acpi_dev->physical_node_lock); dev_warn(dev, "Already associated with ACPI node\n"); kfree(physical_node); if (ACPI_COMPANION(dev) != acpi_dev) goto err; put_device(dev); acpi_dev_put(acpi_dev); return 0; } if (pn->node_id == node_id) { physnode_list = &pn->node; node_id++; } } physical_node->node_id = node_id; physical_node->dev = dev; list_add(&physical_node->node, physnode_list); acpi_dev->physical_node_count++; if (!has_acpi_companion(dev)) ACPI_COMPANION_SET(dev, acpi_dev); acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, physical_node_name); if (retval) dev_err(&acpi_dev->dev, "Failed to create link %s (%d)\n", physical_node_name, retval); retval = sysfs_create_link(&dev->kobj, &acpi_dev->dev.kobj, "firmware_node"); if (retval) dev_err(dev, "Failed to create link firmware_node (%d)\n", retval); mutex_unlock(&acpi_dev->physical_node_lock); if (acpi_dev->wakeup.flags.valid) device_set_wakeup_capable(dev, true); return 0; err: ACPI_COMPANION_SET(dev, NULL); put_device(dev); acpi_dev_put(acpi_dev); return retval; } EXPORT_SYMBOL_GPL(acpi_bind_one); int acpi_unbind_one(struct device *dev) { struct acpi_device *acpi_dev = ACPI_COMPANION(dev); struct acpi_device_physical_node *entry; if (!acpi_dev) return 0; mutex_lock(&acpi_dev->physical_node_lock); list_for_each_entry(entry, &acpi_dev->physical_node_list, node) if (entry->dev == dev) { char physnode_name[PHYSICAL_NODE_NAME_SIZE]; list_del(&entry->node); acpi_dev->physical_node_count--; acpi_physnode_link_name(physnode_name, entry->node_id); sysfs_remove_link(&acpi_dev->dev.kobj, physnode_name); sysfs_remove_link(&dev->kobj, "firmware_node"); ACPI_COMPANION_SET(dev, NULL); /* Drop references taken by acpi_bind_one(). */ put_device(dev); acpi_dev_put(acpi_dev); kfree(entry); break; } mutex_unlock(&acpi_dev->physical_node_lock); return 0; } EXPORT_SYMBOL_GPL(acpi_unbind_one); void acpi_device_notify(struct device *dev) { struct acpi_device *adev; int ret; ret = acpi_bind_one(dev, NULL); if (ret) { struct acpi_bus_type *type = acpi_get_bus_type(dev); if (!type) goto err; adev = type->find_companion(dev); if (!adev) { dev_dbg(dev, "ACPI companion not found\n"); goto err; } ret = acpi_bind_one(dev, adev); if (ret) goto err; if (type->setup) { type->setup(dev); goto done; } } else { adev = ACPI_COMPANION(dev); if (dev_is_pci(dev)) { pci_acpi_setup(dev, adev); goto done; } else if (dev_is_platform(dev)) { acpi_configure_pmsi_domain(dev); } } if (adev->handler && adev->handler->bind) adev->handler->bind(dev); done: acpi_handle_debug(ACPI_HANDLE(dev), "Bound to device %s\n", dev_name(dev)); return; err: dev_dbg(dev, "No ACPI support\n"); } void acpi_device_notify_remove(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); if (!adev) return; if (dev_is_pci(dev)) pci_acpi_cleanup(dev, adev); else if (adev->handler && adev->handler->unbind) adev->handler->unbind(dev); acpi_unbind_one(dev); }
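/*
 * Illustrative sketch (hypothetical "foo" bus, not part of this file): a bus
 * type that wants ACPI companions linked to its devices registers itself with
 * this glue layer, supplying a match and a find_companion callback:
 *
 *	static struct acpi_bus_type foo_acpi_bus = {
 *		.name = "foo",
 *		.match = foo_bus_match,
 *		.find_companion = foo_acpi_find_companion,
 *	};
 *
 *	register_acpi_bus_type(&foo_acpi_bus);
 *
 * acpi_device_notify() then uses find_companion() to locate and bind the
 * companion for each device the driver core adds on that bus.
 */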
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include <linux/kthread.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ TRACE_EVENT(sched_kthread_stop, TP_PROTO(struct task_struct *t), TP_ARGS(t), TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); /* * Tracepoint for the return value of the kthread stopping: */ TRACE_EVENT(sched_kthread_stop_ret, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field( int, ret ) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); /** * sched_kthread_work_queue_work - called when a work gets queued * @worker: pointer to the kthread_worker * @work: pointer to struct kthread_work * * This event occurs when a work is queued immediately or once a * delayed work is actually queued (ie: once the delay has been * reached). */ TRACE_EVENT(sched_kthread_work_queue_work, TP_PROTO(struct kthread_worker *worker, struct kthread_work *work), TP_ARGS(worker, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __field( void *, worker) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __entry->worker = worker; ), TP_printk("work struct=%p function=%ps worker=%p", __entry->work, __entry->function, __entry->worker) ); /** * sched_kthread_work_execute_start - called immediately before the work callback * @work: pointer to struct kthread_work * * Allows to track kthread work execution. */ TRACE_EVENT(sched_kthread_work_execute_start, TP_PROTO(struct kthread_work *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * sched_kthread_work_execute_end - called immediately after the work callback * @work: pointer to struct kthread_work * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(sched_kthread_work_execute_end, TP_PROTO(struct kthread_work *work, kthread_work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /* * Tracepoint for waking up a task: */ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_PROTO(struct task_struct *p), TP_ARGS(__perf_task(p)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, target_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->target_cpu = task_cpu(p); ), TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d", __entry->comm, __entry->pid, __entry->prio, __entry->target_cpu) ); /* * Tracepoint called when waking a task; this tracepoint is guaranteed to be * called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context.
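 *
 * As a usage sketch (standard tracefs interface, shown only for
 * illustration), the event can be enabled and observed from userspace with:
 *
 *	echo 1 > /sys/kernel/tracing/events/sched/sched_wakeup/enable
 *	cat /sys/kernel/tracing/trace_pipe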
*/ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waking up a new task: */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_PROTO(struct task_struct *p), TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(bool preempt, unsigned int prev_state, struct task_struct *p) { unsigned int state; #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ /* * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ if (preempt) return TASK_REPORT_MAX; /* * task_state_index() uses fls() and returns a value in the 0-8 range. * Decrement it by 1 (except for the TASK_RUNNING state, i.e. 0) before * using it in a left-shift operation to get the correct task->state * mapping. */ state = __task_state_index(prev_state, p->exit_state); return state ? (1 << (state - 1)) : state; } #endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: */ TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state), TP_ARGS(preempt, prev, next, prev_state), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { TASK_INTERRUPTIBLE, "S" }, { TASK_UNINTERRUPTIBLE, "D" }, { __TASK_STOPPED, "T" }, { __TASK_TRACED, "t" }, { EXIT_DEAD, "X" }, { EXIT_ZOMBIE, "Z" }, { TASK_PARKED, "P" }, { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ?
"+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* * Tracepoint for a task being migrated: */ TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu), TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu) ); DECLARE_EVENT_CLASS(sched_process_template, TP_PROTO(struct task_struct *p), TP_ARGS(p), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for freeing a task: */ DEFINE_EVENT(sched_process_template, sched_process_free, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a task exiting: */ DEFINE_EVENT(sched_process_template, sched_process_exit, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waiting on task to unschedule: */ DEFINE_EVENT(sched_process_template, sched_wait_task, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, TP_PROTO(struct pid *pid), TP_ARGS(pid), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for kernel_clone: */ TRACE_EVENT(sched_process_fork, TP_PROTO(struct task_struct *parent, struct task_struct *child), TP_ARGS(parent, child), TP_STRUCT__entry( __array( char, parent_comm, TASK_COMM_LEN ) __field( pid_t, parent_pid ) __array( char, child_comm, TASK_COMM_LEN ) __field( pid_t, child_pid ) ), TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; ), TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d", __entry->parent_comm, __entry->parent_pid, __entry->child_comm, __entry->child_pid) ); /* * Tracepoint for exec: */ TRACE_EVENT(sched_process_exec, TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm), TP_ARGS(p, old_pid, bprm), TP_STRUCT__entry( __string( filename, bprm->filename ) __field( pid_t, pid ) __field( pid_t, old_pid ) ), TP_fast_assign( __assign_str(filename, bprm->filename); __entry->pid = p->pid; __entry->old_pid = old_pid; ), TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), __entry->pid, __entry->old_pid) ); #ifdef CONFIG_SCHEDSTATS #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS #else #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP #endif /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be 
welcome. */ DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(__perf_task(tsk), __perf_count(delay)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, delay ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->delay = delay; ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->delay) ); /* * Tracepoint for accounting wait time (time the task is runnable * but not actually running due to scheduler contention). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting iowait time (time the task is not runnable * due to waiting on IO to complete). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting blocked time (time the task is in uninterruptible sleep). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). */ DECLARE_EVENT_CLASS(sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, runtime ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->runtime = runtime; ), TP_printk("comm=%s pid=%d runtime=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->runtime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, runtime)); /* * Tracepoint for showing priority inheritance modifying a task's * priority. */ TRACE_EVENT(sched_pi_setprio, TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, oldprio ) __field( int, newprio ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; __entry->newprio = pi_task ? min(tsk->normal_prio, pi_task->prio) : tsk->normal_prio; /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", __entry->comm, __entry->pid, __entry->oldprio, __entry->newprio) ); #ifdef CONFIG_DETECT_HUNG_TASK TRACE_EVENT(sched_process_hang, TP_PROTO(struct task_struct *tsk), TP_ARGS(tsk), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); #endif /* CONFIG_DETECT_HUNG_TASK */ /* * Tracks migration of tasks from one runqueue to another. Can be used to * detect if automatic NUMA balancing is bouncing between nodes.
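 * A rendered trace line would look like, e.g., "pid=1234 tgid=1234 ngid=0
 * src_cpu=0 src_nid=0 dst_cpu=8 dst_nid=1" (hypothetical values).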
*/ TRACE_EVENT(sched_move_numa, TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), TP_ARGS(tsk, src_cpu, dst_cpu), TP_STRUCT__entry( __field( pid_t, pid ) __field( pid_t, tgid ) __field( pid_t, ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->pid = task_pid_nr(tsk); __entry->tgid = task_tgid_nr(tsk); __entry->ngid = task_numa_group_id(tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_cpu = dst_cpu; __entry->dst_nid = cpu_to_node(dst_cpu); ), TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", __entry->pid, __entry->tgid, __entry->ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_cpu, __entry->dst_nid) ); DECLARE_EVENT_CLASS(sched_numa_pair_template, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), TP_STRUCT__entry( __field( pid_t, src_pid ) __field( pid_t, src_tgid ) __field( pid_t, src_ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( pid_t, dst_pid ) __field( pid_t, dst_tgid ) __field( pid_t, dst_ngid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->src_pid = task_pid_nr(src_tsk); __entry->src_tgid = task_tgid_nr(src_tsk); __entry->src_ngid = task_numa_group_id(src_tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; __entry->dst_cpu = dst_cpu; __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1; ), TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", __entry->src_pid, __entry->src_tgid, __entry->src_ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); #ifdef CONFIG_NUMA_BALANCING #define NUMAB_SKIP_REASON \ EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); NUMAB_SKIP_REASON /* Redefine for symbolic printing. 
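 * After this second expansion, NUMAB_SKIP_REASON produces the
 * { value, "name" } pairs consumed by __print_symbolic() below; the
 * TRACE_DEFINE_ENUM pass above merely exported the raw enum values so that
 * user space can resolve the names.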
*/ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } TRACE_EVENT(sched_skip_vma_numa, TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, enum numa_vmaskip_reason reason), TP_ARGS(mm, vma, reason), TP_STRUCT__entry( __field(unsigned long, numa_scan_offset) __field(unsigned long, vm_start) __field(unsigned long, vm_end) __field(enum numa_vmaskip_reason, reason) ), TP_fast_assign( __entry->numa_scan_offset = mm->numa_scan_offset; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->reason = reason; ), TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", __entry->numa_scan_offset, __entry->vm_start, __entry->vm_end, __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) ); #endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. */ TRACE_EVENT(sched_wake_idle_without_ipi, TP_PROTO(int cpu), TP_ARGS(cpu), TP_STRUCT__entry( __field( int, cpu ) ), TP_fast_assign( __entry->cpu = cpu; ), TP_printk("cpu=%d", __entry->cpu) ); /* * The following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. * * Suffixed with _tp to make them easily identifiable in the code. */ DECLARE_TRACE(pelt_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(pelt_rt_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_thermal_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_cpu_capacity_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); DECLARE_TRACE(sched_util_est_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(sched_util_est_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); DECLARE_TRACE(sched_compute_energy_tp, TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy, unsigned long max_util, unsigned long busy_time), TP_ARGS(p, dst_cpu, energy, max_util, busy_time)); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
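/*
 * Example (not part of the header above): because the _tp hooks are created
 * with DECLARE_TRACE(), in-kernel code can attach probes through the
 * generated register_trace_<name>() helpers. A minimal sketch, assuming a
 * probe module built with access to the scheduler's private headers (struct
 * root_domain lives in kernel/sched/sched.h); names below are illustrative:
 */
#if 0	/* illustrative sketch, not compiled */
#include <linux/module.h>
#include <trace/events/sched.h>

/* Probe signature: the registration cookie first, then the TP_PROTO args. */
static void overutilized_probe(void *data, struct root_domain *rd,
			       bool overutilized)
{
	pr_info("root domain overutilized=%d\n", overutilized);
}

static int __init probe_example_init(void)
{
	return register_trace_sched_overutilized_tp(overutilized_probe, NULL);
}

static void __exit probe_example_exit(void)
{
	unregister_trace_sched_overutilized_tp(overutilized_probe, NULL);
	/* wait for in-flight probes to finish before the module goes away */
	tracepoint_synchronize_unregister();
}

module_init(probe_example_init);
module_exit(probe_example_exit);
MODULE_LICENSE("GPL");
#endif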
/* * usbmidi.c - ALSA USB MIDI driver * * Copyright (c) 2002-2009 Clemens Ladisch * All rights reserved. * * Based on the OSS usb-midi driver by NAGANO Daisuke, * NetBSD's umidi driver by Takuya SHIOZAKI, * the "USB Device Class Definition for MIDI Devices" by Roland * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * Alternatively, this software may be distributed and/or modified under the * terms of the GNU General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) any later * version. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include <linux/kernel.h> #include <linux/types.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/usb.h> #include <linux/wait.h> #include <linux/usb/audio.h> #include <linux/usb/midi.h> #include <linux/module.h> #include <sound/core.h> #include <sound/control.h> #include <sound/rawmidi.h> #include <sound/asequencer.h> #include "usbaudio.h" #include "midi.h" #include "power.h" #include "helper.h" /* * define this to log all USB packets */ /* #define DUMP_PACKETS */ /* * how long to wait after some USB errors, so that hub_wq can disconnect() us * without too many spurious errors */ #define ERROR_DELAY_JIFFIES (HZ / 10) #define OUTPUT_URBS 7 #define INPUT_URBS 7 MODULE_AUTHOR("Clemens Ladisch <clemens@ladisch.de>"); MODULE_DESCRIPTION("USB Audio/MIDI helper module"); MODULE_LICENSE("Dual BSD/GPL"); struct snd_usb_midi_in_endpoint; struct snd_usb_midi_out_endpoint; struct snd_usb_midi_endpoint; struct usb_protocol_ops { void (*input)(struct snd_usb_midi_in_endpoint*, uint8_t*, int); void (*output)(struct snd_usb_midi_out_endpoint *ep, struct urb *urb); void (*output_packet)(struct urb*, uint8_t, uint8_t, uint8_t, uint8_t); void (*init_out_endpoint)(struct snd_usb_midi_out_endpoint *); void (*finish_out_endpoint)(struct snd_usb_midi_out_endpoint *); }; struct snd_usb_midi { struct usb_device *dev; struct snd_card *card; struct usb_interface *iface; const struct snd_usb_audio_quirk *quirk; struct snd_rawmidi *rmidi; const struct usb_protocol_ops *usb_protocol_ops; struct list_head list; struct timer_list error_timer; spinlock_t disc_lock; struct rw_semaphore disc_rwsem; struct mutex mutex; u32 usb_id; int next_midi_device; struct snd_usb_midi_endpoint { struct snd_usb_midi_out_endpoint *out; struct snd_usb_midi_in_endpoint *in; } endpoints[MIDI_MAX_ENDPOINTS]; unsigned long input_triggered; unsigned int opened[2]; unsigned char disconnected; unsigned char input_running; struct snd_kcontrol *roland_load_ctl; }; struct snd_usb_midi_out_endpoint { struct snd_usb_midi *umidi; struct out_urb_context { struct urb *urb; struct snd_usb_midi_out_endpoint *ep; } urbs[OUTPUT_URBS]; unsigned int active_urbs; unsigned int drain_urbs; int max_transfer; /* size of urb buffer */ struct work_struct work; unsigned int next_urb; spinlock_t buffer_lock; struct usbmidi_out_port { struct snd_usb_midi_out_endpoint *ep; struct snd_rawmidi_substream *substream; int active; uint8_t cable; /* cable number << 4 */ uint8_t state; #define STATE_UNKNOWN 0 #define STATE_1PARAM 1 #define STATE_2PARAM_1 2 #define STATE_2PARAM_2 3 #define STATE_SYSEX_0 4 #define STATE_SYSEX_1 5 #define STATE_SYSEX_2 6 uint8_t data[2]; } ports[0x10]; int current_port; wait_queue_head_t drain_wait; }; struct snd_usb_midi_in_endpoint { struct snd_usb_midi *umidi; struct urb *urbs[INPUT_URBS]; struct usbmidi_in_port { struct snd_rawmidi_substream *substream; u8 running_status_length; } ports[0x10]; u8 seen_f5; bool in_sysex; u8 last_cin; u8 error_resubmit; int current_port; }; static void snd_usbmidi_do_output(struct snd_usb_midi_out_endpoint *ep); static const uint8_t snd_usbmidi_cin_length[] = { 0, 0, 2, 3, 3, 1, 2, 3, 3, 3, 3, 3, 2, 2, 3, 1 }; /* * Submits the URB, with error handling. 
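 * -ENODEV means the device is gone (expected during disconnect) and is not
 * logged; any other submission failure is reported via dev_err().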
*/ static int snd_usbmidi_submit_urb(struct urb *urb, gfp_t flags) { int err = usb_submit_urb(urb, flags); if (err < 0 && err != -ENODEV) dev_err(&urb->dev->dev, "usb_submit_urb: %d\n", err); return err; } /* * Error handling for URB completion functions. */ static int snd_usbmidi_urb_error(const struct urb *urb) { switch (urb->status) { /* manually unlinked, or device gone */ case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: case -ENODEV: return -ENODEV; /* errors that might occur during unplugging */ case -EPROTO: case -ETIME: case -EILSEQ: return -EIO; default: dev_err(&urb->dev->dev, "urb status %d\n", urb->status); return 0; /* continue */ } } /* * Receives a chunk of MIDI data. */ static void snd_usbmidi_input_data(struct snd_usb_midi_in_endpoint *ep, int portidx, uint8_t *data, int length) { struct usbmidi_in_port *port = &ep->ports[portidx]; if (!port->substream) { dev_dbg(&ep->umidi->dev->dev, "unexpected port %d!\n", portidx); return; } if (!test_bit(port->substream->number, &ep->umidi->input_triggered)) return; snd_rawmidi_receive(port->substream, data, length); } #ifdef DUMP_PACKETS static void dump_urb(const char *type, const u8 *data, int length) { snd_printk(KERN_DEBUG "%s packet: [", type); for (; length > 0; ++data, --length) printk(KERN_CONT " %02x", *data); printk(KERN_CONT " ]\n"); } #else #define dump_urb(type, data, length) /* nothing */ #endif /* * Processes the data read from the device. */ static void snd_usbmidi_in_urb_complete(struct urb *urb) { struct snd_usb_midi_in_endpoint *ep = urb->context; if (urb->status == 0) { dump_urb("received", urb->transfer_buffer, urb->actual_length); ep->umidi->usb_protocol_ops->input(ep, urb->transfer_buffer, urb->actual_length); } else { int err = snd_usbmidi_urb_error(urb); if (err < 0) { if (err != -ENODEV) { ep->error_resubmit = 1; mod_timer(&ep->umidi->error_timer, jiffies + ERROR_DELAY_JIFFIES); } return; } } urb->dev = ep->umidi->dev; snd_usbmidi_submit_urb(urb, GFP_ATOMIC); } static void snd_usbmidi_out_urb_complete(struct urb *urb) { struct out_urb_context *context = urb->context; struct snd_usb_midi_out_endpoint *ep = context->ep; unsigned int urb_index; unsigned long flags; spin_lock_irqsave(&ep->buffer_lock, flags); urb_index = context - ep->urbs; ep->active_urbs &= ~(1 << urb_index); if (unlikely(ep->drain_urbs)) { ep->drain_urbs &= ~(1 << urb_index); wake_up(&ep->drain_wait); } spin_unlock_irqrestore(&ep->buffer_lock, flags); if (urb->status < 0) { int err = snd_usbmidi_urb_error(urb); if (err < 0) { if (err != -ENODEV) mod_timer(&ep->umidi->error_timer, jiffies + ERROR_DELAY_JIFFIES); return; } } snd_usbmidi_do_output(ep); } /* * This is called when some data should be transferred to the device * (from one or more substreams). 
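 * Scans the URB ring starting at next_urb: each idle URB is handed to the
 * protocol's output() callback to be filled from the active ports, then
 * submitted. The scan stops when an URB stays empty (no more data), when a
 * submission fails, or when it has wrapped around (all URBs busy).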
*/ static void snd_usbmidi_do_output(struct snd_usb_midi_out_endpoint *ep) { unsigned int urb_index; struct urb *urb; unsigned long flags; spin_lock_irqsave(&ep->buffer_lock, flags); if (ep->umidi->disconnected) { spin_unlock_irqrestore(&ep->buffer_lock, flags); return; } urb_index = ep->next_urb; for (;;) { if (!(ep->active_urbs & (1 << urb_index))) { urb = ep->urbs[urb_index].urb; urb->transfer_buffer_length = 0; ep->umidi->usb_protocol_ops->output(ep, urb); if (urb->transfer_buffer_length == 0) break; dump_urb("sending", urb->transfer_buffer, urb->transfer_buffer_length); urb->dev = ep->umidi->dev; if (snd_usbmidi_submit_urb(urb, GFP_ATOMIC) < 0) break; ep->active_urbs |= 1 << urb_index; } if (++urb_index >= OUTPUT_URBS) urb_index = 0; if (urb_index == ep->next_urb) break; } ep->next_urb = urb_index; spin_unlock_irqrestore(&ep->buffer_lock, flags); } static void snd_usbmidi_out_work(struct work_struct *work) { struct snd_usb_midi_out_endpoint *ep = container_of(work, struct snd_usb_midi_out_endpoint, work); snd_usbmidi_do_output(ep); } /* called after transfers have been interrupted due to some USB error */ static void snd_usbmidi_error_timer(struct timer_list *t) { struct snd_usb_midi *umidi = from_timer(umidi, t, error_timer); unsigned int i, j; spin_lock(&umidi->disc_lock); if (umidi->disconnected) { spin_unlock(&umidi->disc_lock); return; } for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_in_endpoint *in = umidi->endpoints[i].in; if (in && in->error_resubmit) { in->error_resubmit = 0; for (j = 0; j < INPUT_URBS; ++j) { if (atomic_read(&in->urbs[j]->use_count)) continue; in->urbs[j]->dev = umidi->dev; snd_usbmidi_submit_urb(in->urbs[j], GFP_ATOMIC); } } if (umidi->endpoints[i].out) snd_usbmidi_do_output(umidi->endpoints[i].out); } spin_unlock(&umidi->disc_lock); } /* helper function to send static data that may not be DMA-able */ static int send_bulk_static_data(struct snd_usb_midi_out_endpoint *ep, const void *data, int len) { int err = 0; void *buf = kmemdup(data, len, GFP_KERNEL); if (!buf) return -ENOMEM; dump_urb("sending", buf, len); if (ep->urbs[0].urb) err = usb_bulk_msg(ep->umidi->dev, ep->urbs[0].urb->pipe, buf, len, NULL, 250); kfree(buf); return err; } /* * Standard USB MIDI protocol: see the spec. * Midiman protocol: like the standard protocol, but the control byte is the * fourth byte in each packet, and uses length instead of CIN. */ static void snd_usbmidi_standard_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { int i; for (i = 0; i + 3 < buffer_length; i += 4) if (buffer[i] != 0) { int cable = buffer[i] >> 4; int length = snd_usbmidi_cin_length[buffer[i] & 0x0f]; snd_usbmidi_input_data(ep, cable, &buffer[i + 1], length); } } static void snd_usbmidi_midiman_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { int i; for (i = 0; i + 3 < buffer_length; i += 4) if (buffer[i + 3] != 0) { int port = buffer[i + 3] >> 4; int length = buffer[i + 3] & 3; snd_usbmidi_input_data(ep, port, &buffer[i], length); } } /* * Buggy M-Audio device: running status on input results in a packet that has * the data bytes but not the status byte and that is marked with CIN 4.
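 * For example (hypothetical bytes): a Note On "90 3C 40" arrives normally as
 * the packet "09 90 3C 40" (CIN 9), but a running-status continuation
 * "3E 40" then arrives as "04 3E 40 00" (CIN 4, no status byte); the handler
 * below recovers the real length from the previous channel message.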
*/ static void snd_usbmidi_maudio_broken_running_status_input( struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { int i; for (i = 0; i + 3 < buffer_length; i += 4) if (buffer[i] != 0) { int cable = buffer[i] >> 4; u8 cin = buffer[i] & 0x0f; struct usbmidi_in_port *port = &ep->ports[cable]; int length; length = snd_usbmidi_cin_length[cin]; if (cin == 0xf && buffer[i + 1] >= 0xf8) ; /* realtime msg: no running status change */ else if (cin >= 0x8 && cin <= 0xe) /* channel msg */ port->running_status_length = length - 1; else if (cin == 0x4 && port->running_status_length != 0 && buffer[i + 1] < 0x80) /* CIN 4 that is not a SysEx */ length = port->running_status_length; else /* * All other msgs cannot begin running status. * (A channel msg sent as two or three CIN 0xF * packets could in theory, but this device * doesn't use this format.) */ port->running_status_length = 0; snd_usbmidi_input_data(ep, cable, &buffer[i + 1], length); } } /* * The QinHeng CH345 is buggy: every second packet inside a SysEx does not * use CIN 4 but repeats the previously seen CIN, while still carrying three * data bytes. */ static void ch345_broken_sysex_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { unsigned int i, cin, length; for (i = 0; i + 3 < buffer_length; i += 4) { if (buffer[i] == 0 && i > 0) break; cin = buffer[i] & 0x0f; if (ep->in_sysex && cin == ep->last_cin && (buffer[i + 1 + (cin == 0x6)] & 0x80) == 0) cin = 0x4; #if 0 if (buffer[i + 1] == 0x90) { /* * Either a corrupted running status or a real note-on * message; impossible to detect reliably. */ } #endif length = snd_usbmidi_cin_length[cin]; snd_usbmidi_input_data(ep, 0, &buffer[i + 1], length); ep->in_sysex = cin == 0x4; if (!ep->in_sysex) ep->last_cin = cin; } } /* * CME protocol: like the standard protocol, but SysEx commands are sent as a * single USB packet preceded by a 0x0F byte. */ static void snd_usbmidi_cme_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { if (buffer_length < 2 || (buffer[0] & 0x0f) != 0x0f) snd_usbmidi_standard_input(ep, buffer, buffer_length); else snd_usbmidi_input_data(ep, buffer[0] >> 4, &buffer[1], buffer_length - 1); } /* * Adds one USB MIDI packet to the output buffer. */ static void snd_usbmidi_output_standard_packet(struct urb *urb, uint8_t p0, uint8_t p1, uint8_t p2, uint8_t p3) { uint8_t *buf = (uint8_t *)urb->transfer_buffer + urb->transfer_buffer_length; buf[0] = p0; buf[1] = p1; buf[2] = p2; buf[3] = p3; urb->transfer_buffer_length += 4; } /* * Adds one Midiman packet to the output buffer. */ static void snd_usbmidi_output_midiman_packet(struct urb *urb, uint8_t p0, uint8_t p1, uint8_t p2, uint8_t p3) { uint8_t *buf = (uint8_t *)urb->transfer_buffer + urb->transfer_buffer_length; buf[0] = p1; buf[1] = p2; buf[2] = p3; buf[3] = (p0 & 0xf0) | snd_usbmidi_cin_length[p0 & 0x0f]; urb->transfer_buffer_length += 4; } /* * Converts MIDI commands to USB MIDI packets.
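 * For example (hypothetical bytes): sending the Note On "90 3C 40" on cable 0
 * walks the state machine below through STATE_2PARAM_1 and STATE_2PARAM_2 and
 * emits the single packet "09 90 3C 40" (CIN 9 derived from the status byte).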
*/ static void snd_usbmidi_transmit_byte(struct usbmidi_out_port *port, uint8_t b, struct urb *urb) { uint8_t p0 = port->cable; void (*output_packet)(struct urb*, uint8_t, uint8_t, uint8_t, uint8_t) = port->ep->umidi->usb_protocol_ops->output_packet; if (b >= 0xf8) { output_packet(urb, p0 | 0x0f, b, 0, 0); } else if (b >= 0xf0) { switch (b) { case 0xf0: port->data[0] = b; port->state = STATE_SYSEX_1; break; case 0xf1: case 0xf3: port->data[0] = b; port->state = STATE_1PARAM; break; case 0xf2: port->data[0] = b; port->state = STATE_2PARAM_1; break; case 0xf4: case 0xf5: port->state = STATE_UNKNOWN; break; case 0xf6: output_packet(urb, p0 | 0x05, 0xf6, 0, 0); port->state = STATE_UNKNOWN; break; case 0xf7: switch (port->state) { case STATE_SYSEX_0: output_packet(urb, p0 | 0x05, 0xf7, 0, 0); break; case STATE_SYSEX_1: output_packet(urb, p0 | 0x06, port->data[0], 0xf7, 0); break; case STATE_SYSEX_2: output_packet(urb, p0 | 0x07, port->data[0], port->data[1], 0xf7); break; } port->state = STATE_UNKNOWN; break; } } else if (b >= 0x80) { port->data[0] = b; if (b >= 0xc0 && b <= 0xdf) port->state = STATE_1PARAM; else port->state = STATE_2PARAM_1; } else { /* b < 0x80 */ switch (port->state) { case STATE_1PARAM: if (port->data[0] < 0xf0) { p0 |= port->data[0] >> 4; } else { p0 |= 0x02; port->state = STATE_UNKNOWN; } output_packet(urb, p0, port->data[0], b, 0); break; case STATE_2PARAM_1: port->data[1] = b; port->state = STATE_2PARAM_2; break; case STATE_2PARAM_2: if (port->data[0] < 0xf0) { p0 |= port->data[0] >> 4; port->state = STATE_2PARAM_1; } else { p0 |= 0x03; port->state = STATE_UNKNOWN; } output_packet(urb, p0, port->data[0], port->data[1], b); break; case STATE_SYSEX_0: port->data[0] = b; port->state = STATE_SYSEX_1; break; case STATE_SYSEX_1: port->data[1] = b; port->state = STATE_SYSEX_2; break; case STATE_SYSEX_2: output_packet(urb, p0 | 0x04, port->data[0], port->data[1], b); port->state = STATE_SYSEX_0; break; } } } static void snd_usbmidi_standard_output(struct snd_usb_midi_out_endpoint *ep, struct urb *urb) { int p; /* FIXME: lower-numbered ports can starve higher-numbered ports */ for (p = 0; p < 0x10; ++p) { struct usbmidi_out_port *port = &ep->ports[p]; if (!port->active) continue; while (urb->transfer_buffer_length + 3 < ep->max_transfer) { uint8_t b; if (snd_rawmidi_transmit(port->substream, &b, 1) != 1) { port->active = 0; break; } snd_usbmidi_transmit_byte(port, b, urb); } } } static const struct usb_protocol_ops snd_usbmidi_standard_ops = { .input = snd_usbmidi_standard_input, .output = snd_usbmidi_standard_output, .output_packet = snd_usbmidi_output_standard_packet, }; static const struct usb_protocol_ops snd_usbmidi_midiman_ops = { .input = snd_usbmidi_midiman_input, .output = snd_usbmidi_standard_output, .output_packet = snd_usbmidi_output_midiman_packet, }; static const struct usb_protocol_ops snd_usbmidi_maudio_broken_running_status_ops = { .input = snd_usbmidi_maudio_broken_running_status_input, .output = snd_usbmidi_standard_output, .output_packet = snd_usbmidi_output_standard_packet, }; static const struct usb_protocol_ops snd_usbmidi_cme_ops = { .input = snd_usbmidi_cme_input, .output = snd_usbmidi_standard_output, .output_packet = snd_usbmidi_output_standard_packet, }; static const struct usb_protocol_ops snd_usbmidi_ch345_broken_sysex_ops = { .input = ch345_broken_sysex_input, .output = snd_usbmidi_standard_output, .output_packet = snd_usbmidi_output_standard_packet, }; /* * AKAI MPD16 protocol: * * For control port (endpoint 1): * ============================== * 
One or more chunks consisting of first byte of (0x10 | msg_len) and then a * SysEx message (msg_len=9 bytes long). * * For data port (endpoint 2): * =========================== * One or more chunks consisting of first byte of (0x20 | msg_len) and then a * MIDI message (msg_len bytes long) * * Messages sent: Active Sense, Note On, Poly Pressure, Control Change. */ static void snd_usbmidi_akai_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { unsigned int pos = 0; unsigned int len = (unsigned int)buffer_length; while (pos < len) { unsigned int port = (buffer[pos] >> 4) - 1; unsigned int msg_len = buffer[pos] & 0x0f; pos++; if (pos + msg_len <= len && port < 2) snd_usbmidi_input_data(ep, 0, &buffer[pos], msg_len); pos += msg_len; } } #define MAX_AKAI_SYSEX_LEN 9 static void snd_usbmidi_akai_output(struct snd_usb_midi_out_endpoint *ep, struct urb *urb) { uint8_t *msg; int pos, end, count, buf_end; uint8_t tmp[MAX_AKAI_SYSEX_LEN]; struct snd_rawmidi_substream *substream = ep->ports[0].substream; if (!ep->ports[0].active) return; msg = urb->transfer_buffer + urb->transfer_buffer_length; buf_end = ep->max_transfer - MAX_AKAI_SYSEX_LEN - 1; /* only try adding more data when there's space for at least 1 SysEx */ while (urb->transfer_buffer_length < buf_end) { count = snd_rawmidi_transmit_peek(substream, tmp, MAX_AKAI_SYSEX_LEN); if (!count) { ep->ports[0].active = 0; return; } /* try to skip non-SysEx data */ for (pos = 0; pos < count && tmp[pos] != 0xF0; pos++) ; if (pos > 0) { snd_rawmidi_transmit_ack(substream, pos); continue; } /* look for the start or end marker */ for (end = 1; end < count && tmp[end] < 0xF0; end++) ; /* next SysEx started before the end of current one */ if (end < count && tmp[end] == 0xF0) { /* it's incomplete - drop it */ snd_rawmidi_transmit_ack(substream, end); continue; } /* SysEx complete */ if (end < count && tmp[end] == 0xF7) { /* queue it, ack it, and get the next one */ count = end + 1; msg[0] = 0x10 | count; memcpy(&msg[1], tmp, count); snd_rawmidi_transmit_ack(substream, count); urb->transfer_buffer_length += count + 1; msg += count + 1; continue; } /* less than 9 bytes and no end byte - wait for more */ if (count < MAX_AKAI_SYSEX_LEN) { ep->ports[0].active = 0; return; } /* 9 bytes and no end marker in sight - malformed, skip it */ snd_rawmidi_transmit_ack(substream, count); } } static const struct usb_protocol_ops snd_usbmidi_akai_ops = { .input = snd_usbmidi_akai_input, .output = snd_usbmidi_akai_output, }; /* * Novation USB MIDI protocol: number of data bytes is in the first byte * (when receiving) (+1!) or in the second byte (when sending); data begins * at the third byte. 
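 * For example (hypothetical bytes): an incoming buffer "04 00 90 3C 40"
 * carries buffer[0] = 4, so the three data bytes "90 3C 40" start at
 * buffer[2]; on output the same message is sent as "00 03 90 3C 40".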
*/ static void snd_usbmidi_novation_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { if (buffer_length < 2 || !buffer[0] || buffer_length < buffer[0] + 1) return; snd_usbmidi_input_data(ep, 0, &buffer[2], buffer[0] - 1); } static void snd_usbmidi_novation_output(struct snd_usb_midi_out_endpoint *ep, struct urb *urb) { uint8_t *transfer_buffer; int count; if (!ep->ports[0].active) return; transfer_buffer = urb->transfer_buffer; count = snd_rawmidi_transmit(ep->ports[0].substream, &transfer_buffer[2], ep->max_transfer - 2); if (count < 1) { ep->ports[0].active = 0; return; } transfer_buffer[0] = 0; transfer_buffer[1] = count; urb->transfer_buffer_length = 2 + count; } static const struct usb_protocol_ops snd_usbmidi_novation_ops = { .input = snd_usbmidi_novation_input, .output = snd_usbmidi_novation_output, }; /* * "raw" protocol: just move raw MIDI bytes from/to the endpoint */ static void snd_usbmidi_raw_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { snd_usbmidi_input_data(ep, 0, buffer, buffer_length); } static void snd_usbmidi_raw_output(struct snd_usb_midi_out_endpoint *ep, struct urb *urb) { int count; if (!ep->ports[0].active) return; count = snd_rawmidi_transmit(ep->ports[0].substream, urb->transfer_buffer, ep->max_transfer); if (count < 1) { ep->ports[0].active = 0; return; } urb->transfer_buffer_length = count; } static const struct usb_protocol_ops snd_usbmidi_raw_ops = { .input = snd_usbmidi_raw_input, .output = snd_usbmidi_raw_output, }; /* * FTDI protocol: raw MIDI bytes, but input packets have two modem status bytes. */ static void snd_usbmidi_ftdi_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { if (buffer_length > 2) snd_usbmidi_input_data(ep, 0, buffer + 2, buffer_length - 2); } static const struct usb_protocol_ops snd_usbmidi_ftdi_ops = { .input = snd_usbmidi_ftdi_input, .output = snd_usbmidi_raw_output, }; static void snd_usbmidi_us122l_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { if (buffer_length != 9) return; buffer_length = 8; while (buffer_length && buffer[buffer_length - 1] == 0xFD) buffer_length--; if (buffer_length) snd_usbmidi_input_data(ep, 0, buffer, buffer_length); } static void snd_usbmidi_us122l_output(struct snd_usb_midi_out_endpoint *ep, struct urb *urb) { int count; if (!ep->ports[0].active) return; switch (snd_usb_get_speed(ep->umidi->dev)) { case USB_SPEED_HIGH: case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: count = 1; break; default: count = 2; } count = snd_rawmidi_transmit(ep->ports[0].substream, urb->transfer_buffer, count); if (count < 1) { ep->ports[0].active = 0; return; } memset(urb->transfer_buffer + count, 0xFD, ep->max_transfer - count); urb->transfer_buffer_length = ep->max_transfer; } static const struct usb_protocol_ops snd_usbmidi_122l_ops = { .input = snd_usbmidi_us122l_input, .output = snd_usbmidi_us122l_output, }; /* * Emagic USB MIDI protocol: raw MIDI with "F5 xx" port switching. 
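 * For example (hypothetical bytes): "f5 01 90 3C 40" routes the Note On to
 * port 0 (ports are numbered from 1 on the wire), and a trailing "ff" marks
 * the end of valid data in a buffer.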
*/ static void snd_usbmidi_emagic_init_out(struct snd_usb_midi_out_endpoint *ep) { static const u8 init_data[] = { /* initialization magic: "get version" */ 0xf0, 0x00, 0x20, 0x31, /* Emagic */ 0x64, /* Unitor8 */ 0x0b, /* version number request */ 0x00, /* command version */ 0x00, /* EEPROM, box 0 */ 0xf7 }; send_bulk_static_data(ep, init_data, sizeof(init_data)); /* while we're at it, pour on more magic */ send_bulk_static_data(ep, init_data, sizeof(init_data)); } static void snd_usbmidi_emagic_finish_out(struct snd_usb_midi_out_endpoint *ep) { static const u8 finish_data[] = { /* switch to patch mode with last preset */ 0xf0, 0x00, 0x20, 0x31, /* Emagic */ 0x64, /* Unitor8 */ 0x10, /* patch switch command */ 0x00, /* command version */ 0x7f, /* to all boxes */ 0x40, /* last preset in EEPROM */ 0xf7 }; send_bulk_static_data(ep, finish_data, sizeof(finish_data)); } static void snd_usbmidi_emagic_input(struct snd_usb_midi_in_endpoint *ep, uint8_t *buffer, int buffer_length) { int i; /* FF indicates end of valid data */ for (i = 0; i < buffer_length; ++i) if (buffer[i] == 0xff) { buffer_length = i; break; } /* handle F5 at end of last buffer */ if (ep->seen_f5) goto switch_port; while (buffer_length > 0) { /* determine size of data until next F5 */ for (i = 0; i < buffer_length; ++i) if (buffer[i] == 0xf5) break; snd_usbmidi_input_data(ep, ep->current_port, buffer, i); buffer += i; buffer_length -= i; if (buffer_length <= 0) break; /* assert(buffer[0] == 0xf5); */ ep->seen_f5 = 1; ++buffer; --buffer_length; switch_port: if (buffer_length <= 0) break; if (buffer[0] < 0x80) { ep->current_port = (buffer[0] - 1) & 15; ++buffer; --buffer_length; } ep->seen_f5 = 0; } } static void snd_usbmidi_emagic_output(struct snd_usb_midi_out_endpoint *ep, struct urb *urb) { int port0 = ep->current_port; uint8_t *buf = urb->transfer_buffer; int buf_free = ep->max_transfer; int length, i; for (i = 0; i < 0x10; ++i) { /* round-robin, starting at the last current port */ int portnum = (port0 + i) & 15; struct usbmidi_out_port *port = &ep->ports[portnum]; if (!port->active) continue; if (snd_rawmidi_transmit_peek(port->substream, buf, 1) != 1) { port->active = 0; continue; } if (portnum != ep->current_port) { if (buf_free < 2) break; ep->current_port = portnum; buf[0] = 0xf5; buf[1] = (portnum + 1) & 15; buf += 2; buf_free -= 2; } if (buf_free < 1) break; length = snd_rawmidi_transmit(port->substream, buf, buf_free); if (length > 0) { buf += length; buf_free -= length; if (buf_free < 1) break; } } if (buf_free < ep->max_transfer && buf_free > 0) { *buf = 0xff; --buf_free; } urb->transfer_buffer_length = ep->max_transfer - buf_free; } static const struct usb_protocol_ops snd_usbmidi_emagic_ops = { .input = snd_usbmidi_emagic_input, .output = snd_usbmidi_emagic_output, .init_out_endpoint = snd_usbmidi_emagic_init_out, .finish_out_endpoint = snd_usbmidi_emagic_finish_out, }; static void update_roland_altsetting(struct snd_usb_midi *umidi) { struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_interface_descriptor *intfd; int is_light_load; intf = umidi->iface; is_light_load = intf->cur_altsetting != intf->altsetting; if (umidi->roland_load_ctl->private_value == is_light_load) return; hostif = &intf->altsetting[umidi->roland_load_ctl->private_value]; intfd = get_iface_desc(hostif); snd_usbmidi_input_stop(&umidi->list); usb_set_interface(umidi->dev, intfd->bInterfaceNumber, intfd->bAlternateSetting); snd_usbmidi_input_start(&umidi->list); } static int substream_open(struct snd_rawmidi_substream 
*substream, int dir, int open) { struct snd_usb_midi *umidi = substream->rmidi->private_data; struct snd_kcontrol *ctl; down_read(&umidi->disc_rwsem); if (umidi->disconnected) { up_read(&umidi->disc_rwsem); return open ? -ENODEV : 0; } mutex_lock(&umidi->mutex); if (open) { if (!umidi->opened[0] && !umidi->opened[1]) { if (umidi->roland_load_ctl) { ctl = umidi->roland_load_ctl; ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; snd_ctl_notify(umidi->card, SNDRV_CTL_EVENT_MASK_INFO, &ctl->id); update_roland_altsetting(umidi); } } umidi->opened[dir]++; if (umidi->opened[1]) snd_usbmidi_input_start(&umidi->list); } else { umidi->opened[dir]--; if (!umidi->opened[1]) snd_usbmidi_input_stop(&umidi->list); if (!umidi->opened[0] && !umidi->opened[1]) { if (umidi->roland_load_ctl) { ctl = umidi->roland_load_ctl; ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; snd_ctl_notify(umidi->card, SNDRV_CTL_EVENT_MASK_INFO, &ctl->id); } } } mutex_unlock(&umidi->mutex); up_read(&umidi->disc_rwsem); return 0; } static int snd_usbmidi_output_open(struct snd_rawmidi_substream *substream) { struct snd_usb_midi *umidi = substream->rmidi->private_data; struct usbmidi_out_port *port = NULL; int i, j; for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) if (umidi->endpoints[i].out) for (j = 0; j < 0x10; ++j) if (umidi->endpoints[i].out->ports[j].substream == substream) { port = &umidi->endpoints[i].out->ports[j]; break; } if (!port) return -ENXIO; substream->runtime->private_data = port; port->state = STATE_UNKNOWN; return substream_open(substream, 0, 1); } static int snd_usbmidi_output_close(struct snd_rawmidi_substream *substream) { struct usbmidi_out_port *port = substream->runtime->private_data; cancel_work_sync(&port->ep->work); return substream_open(substream, 0, 0); } static void snd_usbmidi_output_trigger(struct snd_rawmidi_substream *substream, int up) { struct usbmidi_out_port *port = (struct usbmidi_out_port *)substream->runtime->private_data; port->active = up; if (up) { if (port->ep->umidi->disconnected) { /* gobble up remaining bytes to prevent wait in * snd_rawmidi_drain_output */ snd_rawmidi_proceed(substream); return; } queue_work(system_highpri_wq, &port->ep->work); } } static void snd_usbmidi_output_drain(struct snd_rawmidi_substream *substream) { struct usbmidi_out_port *port = substream->runtime->private_data; struct snd_usb_midi_out_endpoint *ep = port->ep; unsigned int drain_urbs; DEFINE_WAIT(wait); long timeout = msecs_to_jiffies(50); if (ep->umidi->disconnected) return; /* * The substream buffer is empty, but some data might still be in the * currently active URBs, so we have to wait for those to complete. 
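 * Each such URB has its bit set in drain_urbs; the completion handler clears
 * the bit and wakes drain_wait. The wait is bounded by the 50 ms timeout
 * above, so a wedged device cannot block the caller indefinitely.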
*/ spin_lock_irq(&ep->buffer_lock); drain_urbs = ep->active_urbs; if (drain_urbs) { ep->drain_urbs |= drain_urbs; do { prepare_to_wait(&ep->drain_wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ep->buffer_lock); timeout = schedule_timeout(timeout); spin_lock_irq(&ep->buffer_lock); drain_urbs &= ep->drain_urbs; } while (drain_urbs && timeout); finish_wait(&ep->drain_wait, &wait); } port->active = 0; spin_unlock_irq(&ep->buffer_lock); } static int snd_usbmidi_input_open(struct snd_rawmidi_substream *substream) { return substream_open(substream, 1, 1); } static int snd_usbmidi_input_close(struct snd_rawmidi_substream *substream) { return substream_open(substream, 1, 0); } static void snd_usbmidi_input_trigger(struct snd_rawmidi_substream *substream, int up) { struct snd_usb_midi *umidi = substream->rmidi->private_data; if (up) set_bit(substream->number, &umidi->input_triggered); else clear_bit(substream->number, &umidi->input_triggered); } static const struct snd_rawmidi_ops snd_usbmidi_output_ops = { .open = snd_usbmidi_output_open, .close = snd_usbmidi_output_close, .trigger = snd_usbmidi_output_trigger, .drain = snd_usbmidi_output_drain, }; static const struct snd_rawmidi_ops snd_usbmidi_input_ops = { .open = snd_usbmidi_input_open, .close = snd_usbmidi_input_close, .trigger = snd_usbmidi_input_trigger }; static void free_urb_and_buffer(struct snd_usb_midi *umidi, struct urb *urb, unsigned int buffer_length) { usb_free_coherent(umidi->dev, buffer_length, urb->transfer_buffer, urb->transfer_dma); usb_free_urb(urb); } /* * Frees an input endpoint. * May be called when ep hasn't been initialized completely. */ static void snd_usbmidi_in_endpoint_delete(struct snd_usb_midi_in_endpoint *ep) { unsigned int i; for (i = 0; i < INPUT_URBS; ++i) if (ep->urbs[i]) free_urb_and_buffer(ep->umidi, ep->urbs[i], ep->urbs[i]->transfer_buffer_length); kfree(ep); } /* * Creates an input endpoint. */ static int snd_usbmidi_in_endpoint_create(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *ep_info, struct snd_usb_midi_endpoint *rep) { struct snd_usb_midi_in_endpoint *ep; void *buffer; unsigned int pipe; int length; unsigned int i; int err; rep->in = NULL; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (!ep) return -ENOMEM; ep->umidi = umidi; for (i = 0; i < INPUT_URBS; ++i) { ep->urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!ep->urbs[i]) { err = -ENOMEM; goto error; } } if (ep_info->in_interval) pipe = usb_rcvintpipe(umidi->dev, ep_info->in_ep); else pipe = usb_rcvbulkpipe(umidi->dev, ep_info->in_ep); length = usb_maxpacket(umidi->dev, pipe); for (i = 0; i < INPUT_URBS; ++i) { buffer = usb_alloc_coherent(umidi->dev, length, GFP_KERNEL, &ep->urbs[i]->transfer_dma); if (!buffer) { err = -ENOMEM; goto error; } if (ep_info->in_interval) usb_fill_int_urb(ep->urbs[i], umidi->dev, pipe, buffer, length, snd_usbmidi_in_urb_complete, ep, ep_info->in_interval); else usb_fill_bulk_urb(ep->urbs[i], umidi->dev, pipe, buffer, length, snd_usbmidi_in_urb_complete, ep); ep->urbs[i]->transfer_flags = URB_NO_TRANSFER_DMA_MAP; err = usb_urb_ep_type_check(ep->urbs[i]); if (err < 0) { dev_err(&umidi->dev->dev, "invalid MIDI in EP %x\n", ep_info->in_ep); goto error; } } rep->in = ep; return 0; error: snd_usbmidi_in_endpoint_delete(ep); return err; } /* * Frees an output endpoint. * May be called when ep hasn't been initialized completely. 
*/ static void snd_usbmidi_out_endpoint_clear(struct snd_usb_midi_out_endpoint *ep) { unsigned int i; for (i = 0; i < OUTPUT_URBS; ++i) if (ep->urbs[i].urb) { free_urb_and_buffer(ep->umidi, ep->urbs[i].urb, ep->max_transfer); ep->urbs[i].urb = NULL; } } static void snd_usbmidi_out_endpoint_delete(struct snd_usb_midi_out_endpoint *ep) { snd_usbmidi_out_endpoint_clear(ep); kfree(ep); } /* * Creates an output endpoint, and initializes output ports. */ static int snd_usbmidi_out_endpoint_create(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *ep_info, struct snd_usb_midi_endpoint *rep) { struct snd_usb_midi_out_endpoint *ep; unsigned int i; unsigned int pipe; void *buffer; int err; rep->out = NULL; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (!ep) return -ENOMEM; ep->umidi = umidi; for (i = 0; i < OUTPUT_URBS; ++i) { ep->urbs[i].urb = usb_alloc_urb(0, GFP_KERNEL); if (!ep->urbs[i].urb) { err = -ENOMEM; goto error; } ep->urbs[i].ep = ep; } if (ep_info->out_interval) pipe = usb_sndintpipe(umidi->dev, ep_info->out_ep); else pipe = usb_sndbulkpipe(umidi->dev, ep_info->out_ep); switch (umidi->usb_id) { default: ep->max_transfer = usb_maxpacket(umidi->dev, pipe); break; /* * Various chips declare a packet size larger than 4 bytes, but * do not actually work with larger packets: */ case USB_ID(0x0a67, 0x5011): /* Medeli DD305 */ case USB_ID(0x0a92, 0x1020): /* ESI M4U */ case USB_ID(0x1430, 0x474b): /* RedOctane GH MIDI INTERFACE */ case USB_ID(0x15ca, 0x0101): /* Textech USB Midi Cable */ case USB_ID(0x15ca, 0x1806): /* Textech USB Midi Cable */ case USB_ID(0x1a86, 0x752d): /* QinHeng CH345 "USB2.0-MIDI" */ case USB_ID(0xfc08, 0x0101): /* Unknown vendor Cable */ ep->max_transfer = 4; break; /* * Some devices only work with 9 bytes packet size: */ case USB_ID(0x0644, 0x800e): /* Tascam US-122L */ case USB_ID(0x0644, 0x800f): /* Tascam US-144 */ ep->max_transfer = 9; break; } for (i = 0; i < OUTPUT_URBS; ++i) { buffer = usb_alloc_coherent(umidi->dev, ep->max_transfer, GFP_KERNEL, &ep->urbs[i].urb->transfer_dma); if (!buffer) { err = -ENOMEM; goto error; } if (ep_info->out_interval) usb_fill_int_urb(ep->urbs[i].urb, umidi->dev, pipe, buffer, ep->max_transfer, snd_usbmidi_out_urb_complete, &ep->urbs[i], ep_info->out_interval); else usb_fill_bulk_urb(ep->urbs[i].urb, umidi->dev, pipe, buffer, ep->max_transfer, snd_usbmidi_out_urb_complete, &ep->urbs[i]); err = usb_urb_ep_type_check(ep->urbs[i].urb); if (err < 0) { dev_err(&umidi->dev->dev, "invalid MIDI out EP %x\n", ep_info->out_ep); goto error; } ep->urbs[i].urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; } spin_lock_init(&ep->buffer_lock); INIT_WORK(&ep->work, snd_usbmidi_out_work); init_waitqueue_head(&ep->drain_wait); for (i = 0; i < 0x10; ++i) if (ep_info->out_cables & (1 << i)) { ep->ports[i].ep = ep; ep->ports[i].cable = i << 4; } if (umidi->usb_protocol_ops->init_out_endpoint) umidi->usb_protocol_ops->init_out_endpoint(ep); rep->out = ep; return 0; error: snd_usbmidi_out_endpoint_delete(ep); return err; } /* * Frees everything. */ static void snd_usbmidi_free(struct snd_usb_midi *umidi) { int i; for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_endpoint *ep = &umidi->endpoints[i]; if (ep->out) snd_usbmidi_out_endpoint_delete(ep->out); if (ep->in) snd_usbmidi_in_endpoint_delete(ep->in); } mutex_destroy(&umidi->mutex); kfree(umidi); } /* * Unlinks all URBs (must be done before the usb_device is deleted). 
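 *
 * Sketch of the expected caller (the owning USB driver's ->disconnect()
 * path; "chip" and its midi_list are assumed names, not defined here):
 *
 *	struct list_head *p;
 *
 *	list_for_each(p, &chip->midi_list)
 *		snd_usbmidi_disconnect(p);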
*/ void snd_usbmidi_disconnect(struct list_head *p) { struct snd_usb_midi *umidi; unsigned int i, j; umidi = list_entry(p, struct snd_usb_midi, list); /* * an URB's completion handler may start the timer and * a timer may submit an URB. To reliably break the cycle * a flag under lock must be used */ down_write(&umidi->disc_rwsem); spin_lock_irq(&umidi->disc_lock); umidi->disconnected = 1; spin_unlock_irq(&umidi->disc_lock); up_write(&umidi->disc_rwsem); del_timer_sync(&umidi->error_timer); for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_endpoint *ep = &umidi->endpoints[i]; if (ep->out) cancel_work_sync(&ep->out->work); if (ep->out) { for (j = 0; j < OUTPUT_URBS; ++j) usb_kill_urb(ep->out->urbs[j].urb); if (umidi->usb_protocol_ops->finish_out_endpoint) umidi->usb_protocol_ops->finish_out_endpoint(ep->out); ep->out->active_urbs = 0; if (ep->out->drain_urbs) { ep->out->drain_urbs = 0; wake_up(&ep->out->drain_wait); } } if (ep->in) for (j = 0; j < INPUT_URBS; ++j) usb_kill_urb(ep->in->urbs[j]); /* free endpoints here; later call can result in Oops */ if (ep->out) snd_usbmidi_out_endpoint_clear(ep->out); if (ep->in) { snd_usbmidi_in_endpoint_delete(ep->in); ep->in = NULL; } } } EXPORT_SYMBOL(snd_usbmidi_disconnect); static void snd_usbmidi_rawmidi_free(struct snd_rawmidi *rmidi) { struct snd_usb_midi *umidi = rmidi->private_data; snd_usbmidi_free(umidi); } static struct snd_rawmidi_substream *snd_usbmidi_find_substream(struct snd_usb_midi *umidi, int stream, int number) { struct snd_rawmidi_substream *substream; list_for_each_entry(substream, &umidi->rmidi->streams[stream].substreams, list) { if (substream->number == number) return substream; } return NULL; } /* * This list specifies names for ports that do not fit into the standard * "(product) MIDI (n)" schema because they aren't external MIDI ports, * such as internal control or synthesizer ports. 
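 *
 * For example, the Roland UA-100 entry below,
 *
 *	CONTROL_PORT(0x0582, 0x0000, 2, "%s Control"),
 *
 * makes its third port appear as "<shortname> Control" instead of the
 * default "<shortname> MIDI 3" built from the generic schema.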
*/ static struct port_info { u32 id; short int port; short int voices; const char *name; unsigned int seq_flags; } snd_usbmidi_port_info[] = { #define PORT_INFO(vendor, product, num, name_, voices_, flags) \ { .id = USB_ID(vendor, product), \ .port = num, .voices = voices_, \ .name = name_, .seq_flags = flags } #define EXTERNAL_PORT(vendor, product, num, name) \ PORT_INFO(vendor, product, num, name, 0, \ SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | \ SNDRV_SEQ_PORT_TYPE_HARDWARE | \ SNDRV_SEQ_PORT_TYPE_PORT) #define CONTROL_PORT(vendor, product, num, name) \ PORT_INFO(vendor, product, num, name, 0, \ SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | \ SNDRV_SEQ_PORT_TYPE_HARDWARE) #define GM_SYNTH_PORT(vendor, product, num, name, voices) \ PORT_INFO(vendor, product, num, name, voices, \ SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | \ SNDRV_SEQ_PORT_TYPE_MIDI_GM | \ SNDRV_SEQ_PORT_TYPE_HARDWARE | \ SNDRV_SEQ_PORT_TYPE_SYNTHESIZER) #define ROLAND_SYNTH_PORT(vendor, product, num, name, voices) \ PORT_INFO(vendor, product, num, name, voices, \ SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | \ SNDRV_SEQ_PORT_TYPE_MIDI_GM | \ SNDRV_SEQ_PORT_TYPE_MIDI_GM2 | \ SNDRV_SEQ_PORT_TYPE_MIDI_GS | \ SNDRV_SEQ_PORT_TYPE_MIDI_XG | \ SNDRV_SEQ_PORT_TYPE_HARDWARE | \ SNDRV_SEQ_PORT_TYPE_SYNTHESIZER) #define SOUNDCANVAS_PORT(vendor, product, num, name, voices) \ PORT_INFO(vendor, product, num, name, voices, \ SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | \ SNDRV_SEQ_PORT_TYPE_MIDI_GM | \ SNDRV_SEQ_PORT_TYPE_MIDI_GM2 | \ SNDRV_SEQ_PORT_TYPE_MIDI_GS | \ SNDRV_SEQ_PORT_TYPE_MIDI_XG | \ SNDRV_SEQ_PORT_TYPE_MIDI_MT32 | \ SNDRV_SEQ_PORT_TYPE_HARDWARE | \ SNDRV_SEQ_PORT_TYPE_SYNTHESIZER) /* Yamaha MOTIF XF */ GM_SYNTH_PORT(0x0499, 0x105c, 0, "%s Tone Generator", 128), CONTROL_PORT(0x0499, 0x105c, 1, "%s Remote Control"), EXTERNAL_PORT(0x0499, 0x105c, 2, "%s Thru"), CONTROL_PORT(0x0499, 0x105c, 3, "%s Editor"), /* Roland UA-100 */ CONTROL_PORT(0x0582, 0x0000, 2, "%s Control"), /* Roland SC-8850 */ SOUNDCANVAS_PORT(0x0582, 0x0003, 0, "%s Part A", 128), SOUNDCANVAS_PORT(0x0582, 0x0003, 1, "%s Part B", 128), SOUNDCANVAS_PORT(0x0582, 0x0003, 2, "%s Part C", 128), SOUNDCANVAS_PORT(0x0582, 0x0003, 3, "%s Part D", 128), EXTERNAL_PORT(0x0582, 0x0003, 4, "%s MIDI 1"), EXTERNAL_PORT(0x0582, 0x0003, 5, "%s MIDI 2"), /* Roland U-8 */ EXTERNAL_PORT(0x0582, 0x0004, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x0004, 1, "%s Control"), /* Roland SC-8820 */ SOUNDCANVAS_PORT(0x0582, 0x0007, 0, "%s Part A", 64), SOUNDCANVAS_PORT(0x0582, 0x0007, 1, "%s Part B", 64), EXTERNAL_PORT(0x0582, 0x0007, 2, "%s MIDI"), /* Roland SK-500 */ SOUNDCANVAS_PORT(0x0582, 0x000b, 0, "%s Part A", 64), SOUNDCANVAS_PORT(0x0582, 0x000b, 1, "%s Part B", 64), EXTERNAL_PORT(0x0582, 0x000b, 2, "%s MIDI"), /* Roland SC-D70 */ SOUNDCANVAS_PORT(0x0582, 0x000c, 0, "%s Part A", 64), SOUNDCANVAS_PORT(0x0582, 0x000c, 1, "%s Part B", 64), EXTERNAL_PORT(0x0582, 0x000c, 2, "%s MIDI"), /* Edirol UM-880 */ CONTROL_PORT(0x0582, 0x0014, 8, "%s Control"), /* Edirol SD-90 */ ROLAND_SYNTH_PORT(0x0582, 0x0016, 0, "%s Part A", 128), ROLAND_SYNTH_PORT(0x0582, 0x0016, 1, "%s Part B", 128), EXTERNAL_PORT(0x0582, 0x0016, 2, "%s MIDI 1"), EXTERNAL_PORT(0x0582, 0x0016, 3, "%s MIDI 2"), /* Edirol UM-550 */ CONTROL_PORT(0x0582, 0x0023, 5, "%s Control"), /* Edirol SD-20 */ ROLAND_SYNTH_PORT(0x0582, 0x0027, 0, "%s Part A", 64), ROLAND_SYNTH_PORT(0x0582, 0x0027, 1, "%s Part B", 64), EXTERNAL_PORT(0x0582, 0x0027, 2, "%s MIDI"), /* Edirol SD-80 */ ROLAND_SYNTH_PORT(0x0582, 0x0029, 0, "%s Part A", 128), ROLAND_SYNTH_PORT(0x0582, 0x0029, 1, "%s Part B", 128), 
EXTERNAL_PORT(0x0582, 0x0029, 2, "%s MIDI 1"), EXTERNAL_PORT(0x0582, 0x0029, 3, "%s MIDI 2"), /* Edirol UA-700 */ EXTERNAL_PORT(0x0582, 0x002b, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x002b, 1, "%s Control"), /* Roland VariOS */ EXTERNAL_PORT(0x0582, 0x002f, 0, "%s MIDI"), EXTERNAL_PORT(0x0582, 0x002f, 1, "%s External MIDI"), EXTERNAL_PORT(0x0582, 0x002f, 2, "%s Sync"), /* Edirol PCR */ EXTERNAL_PORT(0x0582, 0x0033, 0, "%s MIDI"), EXTERNAL_PORT(0x0582, 0x0033, 1, "%s 1"), EXTERNAL_PORT(0x0582, 0x0033, 2, "%s 2"), /* BOSS GS-10 */ EXTERNAL_PORT(0x0582, 0x003b, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x003b, 1, "%s Control"), /* Edirol UA-1000 */ EXTERNAL_PORT(0x0582, 0x0044, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x0044, 1, "%s Control"), /* Edirol UR-80 */ EXTERNAL_PORT(0x0582, 0x0048, 0, "%s MIDI"), EXTERNAL_PORT(0x0582, 0x0048, 1, "%s 1"), EXTERNAL_PORT(0x0582, 0x0048, 2, "%s 2"), /* Edirol PCR-A */ EXTERNAL_PORT(0x0582, 0x004d, 0, "%s MIDI"), EXTERNAL_PORT(0x0582, 0x004d, 1, "%s 1"), EXTERNAL_PORT(0x0582, 0x004d, 2, "%s 2"), /* BOSS GT-PRO */ CONTROL_PORT(0x0582, 0x0089, 0, "%s Control"), /* Edirol UM-3EX */ CONTROL_PORT(0x0582, 0x009a, 3, "%s Control"), /* Roland VG-99 */ CONTROL_PORT(0x0582, 0x00b2, 0, "%s Control"), EXTERNAL_PORT(0x0582, 0x00b2, 1, "%s MIDI"), /* Cakewalk Sonar V-Studio 100 */ EXTERNAL_PORT(0x0582, 0x00eb, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x00eb, 1, "%s Control"), /* Roland VB-99 */ CONTROL_PORT(0x0582, 0x0102, 0, "%s Control"), EXTERNAL_PORT(0x0582, 0x0102, 1, "%s MIDI"), /* Roland A-PRO */ EXTERNAL_PORT(0x0582, 0x010f, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x010f, 1, "%s 1"), CONTROL_PORT(0x0582, 0x010f, 2, "%s 2"), /* Roland SD-50 */ ROLAND_SYNTH_PORT(0x0582, 0x0114, 0, "%s Synth", 128), EXTERNAL_PORT(0x0582, 0x0114, 1, "%s MIDI"), CONTROL_PORT(0x0582, 0x0114, 2, "%s Control"), /* Roland OCTA-CAPTURE */ EXTERNAL_PORT(0x0582, 0x0120, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x0120, 1, "%s Control"), EXTERNAL_PORT(0x0582, 0x0121, 0, "%s MIDI"), CONTROL_PORT(0x0582, 0x0121, 1, "%s Control"), /* Roland SPD-SX */ CONTROL_PORT(0x0582, 0x0145, 0, "%s Control"), EXTERNAL_PORT(0x0582, 0x0145, 1, "%s MIDI"), /* Roland A-Series */ CONTROL_PORT(0x0582, 0x0156, 0, "%s Keyboard"), EXTERNAL_PORT(0x0582, 0x0156, 1, "%s MIDI"), /* Roland INTEGRA-7 */ ROLAND_SYNTH_PORT(0x0582, 0x015b, 0, "%s Synth", 128), CONTROL_PORT(0x0582, 0x015b, 1, "%s Control"), /* M-Audio MidiSport 8x8 */ CONTROL_PORT(0x0763, 0x1031, 8, "%s Control"), CONTROL_PORT(0x0763, 0x1033, 8, "%s Control"), /* MOTU Fastlane */ EXTERNAL_PORT(0x07fd, 0x0001, 0, "%s MIDI A"), EXTERNAL_PORT(0x07fd, 0x0001, 1, "%s MIDI B"), /* Emagic Unitor8/AMT8/MT4 */ EXTERNAL_PORT(0x086a, 0x0001, 8, "%s Broadcast"), EXTERNAL_PORT(0x086a, 0x0002, 8, "%s Broadcast"), EXTERNAL_PORT(0x086a, 0x0003, 4, "%s Broadcast"), /* Akai MPD16 */ CONTROL_PORT(0x09e8, 0x0062, 0, "%s Control"), PORT_INFO(0x09e8, 0x0062, 1, "%s MIDI", 0, SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_HARDWARE), /* Access Music Virus TI */ EXTERNAL_PORT(0x133e, 0x0815, 0, "%s MIDI"), PORT_INFO(0x133e, 0x0815, 1, "%s Synth", 0, SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_HARDWARE | SNDRV_SEQ_PORT_TYPE_SYNTHESIZER), }; static struct port_info *find_port_info(struct snd_usb_midi *umidi, int number) { int i; for (i = 0; i < ARRAY_SIZE(snd_usbmidi_port_info); ++i) { if (snd_usbmidi_port_info[i].id == umidi->usb_id && snd_usbmidi_port_info[i].port == number) return &snd_usbmidi_port_info[i]; } return NULL; } static void snd_usbmidi_get_port_info(struct snd_rawmidi *rmidi, 
int number, struct snd_seq_port_info *seq_port_info) { struct snd_usb_midi *umidi = rmidi->private_data; struct port_info *port_info; /* TODO: read port flags from descriptors */ port_info = find_port_info(umidi, number); if (port_info) { seq_port_info->type = port_info->seq_flags; seq_port_info->midi_voices = port_info->voices; } } static struct usb_midi_in_jack_descriptor *find_usb_in_jack_descriptor( struct usb_host_interface *hostif, uint8_t jack_id) { unsigned char *extra = hostif->extra; int extralen = hostif->extralen; while (extralen > 4) { struct usb_midi_in_jack_descriptor *injd = (struct usb_midi_in_jack_descriptor *)extra; if (injd->bLength >= sizeof(*injd) && injd->bDescriptorType == USB_DT_CS_INTERFACE && injd->bDescriptorSubtype == UAC_MIDI_IN_JACK && injd->bJackID == jack_id) return injd; if (!extra[0]) break; extralen -= extra[0]; extra += extra[0]; } return NULL; } static struct usb_midi_out_jack_descriptor *find_usb_out_jack_descriptor( struct usb_host_interface *hostif, uint8_t jack_id) { unsigned char *extra = hostif->extra; int extralen = hostif->extralen; while (extralen > 4) { struct usb_midi_out_jack_descriptor *outjd = (struct usb_midi_out_jack_descriptor *)extra; if (outjd->bLength >= sizeof(*outjd) && outjd->bDescriptorType == USB_DT_CS_INTERFACE && outjd->bDescriptorSubtype == UAC_MIDI_OUT_JACK && outjd->bJackID == jack_id) return outjd; if (!extra[0]) break; extralen -= extra[0]; extra += extra[0]; } return NULL; } static void snd_usbmidi_init_substream(struct snd_usb_midi *umidi, int stream, int number, int jack_id, struct snd_rawmidi_substream **rsubstream) { struct port_info *port_info; const char *name_format; struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_midi_in_jack_descriptor *injd; struct usb_midi_out_jack_descriptor *outjd; uint8_t jack_name_buf[32]; uint8_t *default_jack_name = "MIDI"; uint8_t *jack_name = default_jack_name; uint8_t iJack; size_t sz; int res; struct snd_rawmidi_substream *substream = snd_usbmidi_find_substream(umidi, stream, number); if (!substream) { dev_err(&umidi->dev->dev, "substream %d:%d not found\n", stream, number); return; } intf = umidi->iface; if (intf && jack_id >= 0) { hostif = intf->cur_altsetting; iJack = 0; if (stream != SNDRV_RAWMIDI_STREAM_OUTPUT) { /* in jacks connect to outs */ outjd = find_usb_out_jack_descriptor(hostif, jack_id); if (outjd) { sz = USB_DT_MIDI_OUT_SIZE(outjd->bNrInputPins); if (outjd->bLength >= sz) iJack = *(((uint8_t *) outjd) + sz - sizeof(uint8_t)); } } else { /* and out jacks connect to ins */ injd = find_usb_in_jack_descriptor(hostif, jack_id); if (injd) iJack = injd->iJack; } if (iJack != 0) { res = usb_string(umidi->dev, iJack, jack_name_buf, ARRAY_SIZE(jack_name_buf)); if (res) jack_name = jack_name_buf; } } port_info = find_port_info(umidi, number); name_format = port_info ? port_info->name : (jack_name != default_jack_name ? "%s %s" : "%s %s %d"); snprintf(substream->name, sizeof(substream->name), name_format, umidi->card->shortname, jack_name, number + 1); *rsubstream = substream; } /* * Creates the endpoints and their ports. 
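 *
 * Each set bit in an endpoint's out_cables/in_cables mask becomes one
 * rawmidi substream; e.g. out_cables == 0x0005 (illustrative value)
 * creates output substreams for cables 0 and 2 only.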
*/ static int snd_usbmidi_create_endpoints(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoints) { int i, j, err; int out_ports = 0, in_ports = 0; for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { if (endpoints[i].out_cables) { err = snd_usbmidi_out_endpoint_create(umidi, &endpoints[i], &umidi->endpoints[i]); if (err < 0) return err; } if (endpoints[i].in_cables) { err = snd_usbmidi_in_endpoint_create(umidi, &endpoints[i], &umidi->endpoints[i]); if (err < 0) return err; } for (j = 0; j < 0x10; ++j) { if (endpoints[i].out_cables & (1 << j)) { snd_usbmidi_init_substream(umidi, SNDRV_RAWMIDI_STREAM_OUTPUT, out_ports, endpoints[i].assoc_out_jacks[j], &umidi->endpoints[i].out->ports[j].substream); ++out_ports; } if (endpoints[i].in_cables & (1 << j)) { snd_usbmidi_init_substream(umidi, SNDRV_RAWMIDI_STREAM_INPUT, in_ports, endpoints[i].assoc_in_jacks[j], &umidi->endpoints[i].in->ports[j].substream); ++in_ports; } } } dev_dbg(&umidi->dev->dev, "created %d output and %d input ports\n", out_ports, in_ports); return 0; } static struct usb_ms_endpoint_descriptor *find_usb_ms_endpoint_descriptor( struct usb_host_endpoint *hostep) { unsigned char *extra = hostep->extra; int extralen = hostep->extralen; while (extralen > 3) { struct usb_ms_endpoint_descriptor *ms_ep = (struct usb_ms_endpoint_descriptor *)extra; if (ms_ep->bLength > 3 && ms_ep->bDescriptorType == USB_DT_CS_ENDPOINT && ms_ep->bDescriptorSubtype == UAC_MS_GENERAL) return ms_ep; if (!extra[0]) break; extralen -= extra[0]; extra += extra[0]; } return NULL; } /* * Returns MIDIStreaming device capabilities. */ static int snd_usbmidi_get_ms_info(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoints) { struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_interface_descriptor *intfd; struct usb_ms_header_descriptor *ms_header; struct usb_host_endpoint *hostep; struct usb_endpoint_descriptor *ep; struct usb_ms_endpoint_descriptor *ms_ep; int i, j, epidx; intf = umidi->iface; if (!intf) return -ENXIO; hostif = &intf->altsetting[0]; intfd = get_iface_desc(hostif); ms_header = (struct usb_ms_header_descriptor *)hostif->extra; if (hostif->extralen >= 7 && ms_header->bLength >= 7 && ms_header->bDescriptorType == USB_DT_CS_INTERFACE && ms_header->bDescriptorSubtype == UAC_HEADER) dev_dbg(&umidi->dev->dev, "MIDIStreaming version %02x.%02x\n", ((uint8_t *)&ms_header->bcdMSC)[1], ((uint8_t *)&ms_header->bcdMSC)[0]); else dev_warn(&umidi->dev->dev, "MIDIStreaming interface descriptor not found\n"); epidx = 0; for (i = 0; i < intfd->bNumEndpoints; ++i) { hostep = &hostif->endpoint[i]; ep = get_ep_desc(hostep); if (!usb_endpoint_xfer_bulk(ep) && !usb_endpoint_xfer_int(ep)) continue; ms_ep = find_usb_ms_endpoint_descriptor(hostep); if (!ms_ep) continue; if (ms_ep->bLength <= sizeof(*ms_ep)) continue; if (ms_ep->bNumEmbMIDIJack > 0x10) continue; if (ms_ep->bLength < sizeof(*ms_ep) + ms_ep->bNumEmbMIDIJack) continue; if (usb_endpoint_dir_out(ep)) { if (endpoints[epidx].out_ep) { if (++epidx >= MIDI_MAX_ENDPOINTS) { dev_warn(&umidi->dev->dev, "too many endpoints\n"); break; } } endpoints[epidx].out_ep = usb_endpoint_num(ep); if (usb_endpoint_xfer_int(ep)) endpoints[epidx].out_interval = ep->bInterval; else if (snd_usb_get_speed(umidi->dev) == USB_SPEED_LOW) /* * Low speed bulk transfers don't exist, so * force interrupt transfers for devices like * ESI MIDI Mate that try to use them anyway. 
*/ endpoints[epidx].out_interval = 1; endpoints[epidx].out_cables = (1 << ms_ep->bNumEmbMIDIJack) - 1; for (j = 0; j < ms_ep->bNumEmbMIDIJack; ++j) endpoints[epidx].assoc_out_jacks[j] = ms_ep->baAssocJackID[j]; for (; j < ARRAY_SIZE(endpoints[epidx].assoc_out_jacks); ++j) endpoints[epidx].assoc_out_jacks[j] = -1; dev_dbg(&umidi->dev->dev, "EP %02X: %d jack(s)\n", ep->bEndpointAddress, ms_ep->bNumEmbMIDIJack); } else { if (endpoints[epidx].in_ep) { if (++epidx >= MIDI_MAX_ENDPOINTS) { dev_warn(&umidi->dev->dev, "too many endpoints\n"); break; } } endpoints[epidx].in_ep = usb_endpoint_num(ep); if (usb_endpoint_xfer_int(ep)) endpoints[epidx].in_interval = ep->bInterval; else if (snd_usb_get_speed(umidi->dev) == USB_SPEED_LOW) endpoints[epidx].in_interval = 1; endpoints[epidx].in_cables = (1 << ms_ep->bNumEmbMIDIJack) - 1; for (j = 0; j < ms_ep->bNumEmbMIDIJack; ++j) endpoints[epidx].assoc_in_jacks[j] = ms_ep->baAssocJackID[j]; for (; j < ARRAY_SIZE(endpoints[epidx].assoc_in_jacks); ++j) endpoints[epidx].assoc_in_jacks[j] = -1; dev_dbg(&umidi->dev->dev, "EP %02X: %d jack(s)\n", ep->bEndpointAddress, ms_ep->bNumEmbMIDIJack); } } return 0; } static int roland_load_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *info) { static const char *const names[] = { "High Load", "Light Load" }; return snd_ctl_enum_info(info, 1, 2, names); } static int roland_load_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *value) { value->value.enumerated.item[0] = kcontrol->private_value; return 0; } static int roland_load_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *value) { struct snd_usb_midi *umidi = kcontrol->private_data; int changed; if (value->value.enumerated.item[0] > 1) return -EINVAL; mutex_lock(&umidi->mutex); changed = value->value.enumerated.item[0] != kcontrol->private_value; if (changed) kcontrol->private_value = value->value.enumerated.item[0]; mutex_unlock(&umidi->mutex); return changed; } static const struct snd_kcontrol_new roland_load_ctl = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "MIDI Input Mode", .info = roland_load_info, .get = roland_load_get, .put = roland_load_put, .private_value = 1, }; /* * On Roland devices, use the second alternate setting to be able to use * the interrupt input endpoint. */ static void snd_usbmidi_switch_roland_altsetting(struct snd_usb_midi *umidi) { struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_interface_descriptor *intfd; intf = umidi->iface; if (!intf || intf->num_altsetting != 2) return; hostif = &intf->altsetting[1]; intfd = get_iface_desc(hostif); /* If either or both of the endpoints support interrupt transfer, * then use the alternate setting */ if (intfd->bNumEndpoints != 2 || !((get_endpoint(hostif, 0)->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT || (get_endpoint(hostif, 1)->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT)) return; dev_dbg(&umidi->dev->dev, "switching to altsetting %d with int ep\n", intfd->bAlternateSetting); usb_set_interface(umidi->dev, intfd->bInterfaceNumber, intfd->bAlternateSetting); umidi->roland_load_ctl = snd_ctl_new1(&roland_load_ctl, umidi); if (snd_ctl_add(umidi->card, umidi->roland_load_ctl) < 0) umidi->roland_load_ctl = NULL; } /* * Try to find any usable endpoints in the interface. 
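 *
 * Only bulk and interrupt endpoints are considered, taken in descriptor
 * order.  An illustrative example: an altsetting with a bulk-out endpoint
 * 0x02 and an interrupt-in endpoint 0x81 yields endpoint[0].out_ep == 2
 * and endpoint[0].in_ep == 1, with in_interval taken from the IN
 * endpoint's bInterval.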
*/ static int snd_usbmidi_detect_endpoints(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoint, int max_endpoints) { struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_interface_descriptor *intfd; struct usb_endpoint_descriptor *epd; int i, out_eps = 0, in_eps = 0; if (USB_ID_VENDOR(umidi->usb_id) == 0x0582) snd_usbmidi_switch_roland_altsetting(umidi); if (endpoint[0].out_ep || endpoint[0].in_ep) return 0; intf = umidi->iface; if (!intf || intf->num_altsetting < 1) return -ENOENT; hostif = intf->cur_altsetting; intfd = get_iface_desc(hostif); for (i = 0; i < intfd->bNumEndpoints; ++i) { epd = get_endpoint(hostif, i); if (!usb_endpoint_xfer_bulk(epd) && !usb_endpoint_xfer_int(epd)) continue; if (out_eps < max_endpoints && usb_endpoint_dir_out(epd)) { endpoint[out_eps].out_ep = usb_endpoint_num(epd); if (usb_endpoint_xfer_int(epd)) endpoint[out_eps].out_interval = epd->bInterval; ++out_eps; } if (in_eps < max_endpoints && usb_endpoint_dir_in(epd)) { endpoint[in_eps].in_ep = usb_endpoint_num(epd); if (usb_endpoint_xfer_int(epd)) endpoint[in_eps].in_interval = epd->bInterval; ++in_eps; } } return (out_eps || in_eps) ? 0 : -ENOENT; } /* * Detects the endpoints for one-port-per-endpoint protocols. */ static int snd_usbmidi_detect_per_port_endpoints(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoints) { int err, i; err = snd_usbmidi_detect_endpoints(umidi, endpoints, MIDI_MAX_ENDPOINTS); for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { if (endpoints[i].out_ep) endpoints[i].out_cables = 0x0001; if (endpoints[i].in_ep) endpoints[i].in_cables = 0x0001; } return err; } /* * Detects the endpoints and ports of Yamaha devices. */ static int snd_usbmidi_detect_yamaha(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoint) { struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_interface_descriptor *intfd; uint8_t *cs_desc; intf = umidi->iface; if (!intf) return -ENOENT; hostif = intf->altsetting; intfd = get_iface_desc(hostif); if (intfd->bNumEndpoints < 1) return -ENOENT; /* * For each port there is one MIDI_IN/OUT_JACK descriptor, not * necessarily with any useful contents. So simply count 'em. */ for (cs_desc = hostif->extra; cs_desc < hostif->extra + hostif->extralen && cs_desc[0] >= 2; cs_desc += cs_desc[0]) { if (cs_desc[1] == USB_DT_CS_INTERFACE) { if (cs_desc[2] == UAC_MIDI_IN_JACK) endpoint->in_cables = (endpoint->in_cables << 1) | 1; else if (cs_desc[2] == UAC_MIDI_OUT_JACK) endpoint->out_cables = (endpoint->out_cables << 1) | 1; } } if (!endpoint->in_cables && !endpoint->out_cables) return -ENOENT; return snd_usbmidi_detect_endpoints(umidi, endpoint, 1); } /* * Detects the endpoints and ports of Roland devices. */ static int snd_usbmidi_detect_roland(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoint) { struct usb_interface *intf; struct usb_host_interface *hostif; u8 *cs_desc; intf = umidi->iface; if (!intf) return -ENOENT; hostif = intf->altsetting; /* * Some devices have a descriptor <06 24 F1 02 <inputs> <outputs>>, * some have standard class descriptors, or both kinds, or neither. 
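 *
 * A worked example: a vendor descriptor 06 24 F1 02 03 02 declares
 * 3 inputs and 2 outputs, so the code below computes
 * in_cables == 0x0007 and out_cables == 0x0003 via (1 << n) - 1.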
*/ for (cs_desc = hostif->extra; cs_desc < hostif->extra + hostif->extralen && cs_desc[0] >= 2; cs_desc += cs_desc[0]) { if (cs_desc[0] >= 6 && cs_desc[1] == USB_DT_CS_INTERFACE && cs_desc[2] == 0xf1 && cs_desc[3] == 0x02) { if (cs_desc[4] > 0x10 || cs_desc[5] > 0x10) continue; endpoint->in_cables = (1 << cs_desc[4]) - 1; endpoint->out_cables = (1 << cs_desc[5]) - 1; return snd_usbmidi_detect_endpoints(umidi, endpoint, 1); } else if (cs_desc[0] >= 7 && cs_desc[1] == USB_DT_CS_INTERFACE && cs_desc[2] == UAC_HEADER) { return snd_usbmidi_get_ms_info(umidi, endpoint); } } return -ENODEV; } /* * Creates the endpoints and their ports for Midiman devices. */ static int snd_usbmidi_create_endpoints_midiman(struct snd_usb_midi *umidi, struct snd_usb_midi_endpoint_info *endpoint) { struct snd_usb_midi_endpoint_info ep_info; struct usb_interface *intf; struct usb_host_interface *hostif; struct usb_interface_descriptor *intfd; struct usb_endpoint_descriptor *epd; int cable, err; intf = umidi->iface; if (!intf) return -ENOENT; hostif = intf->altsetting; intfd = get_iface_desc(hostif); /* * The various MidiSport devices have more or less random endpoint * numbers, so we have to identify the endpoints by their index in * the descriptor array, like the driver for that other OS does. * * There is one interrupt input endpoint for all input ports, one * bulk output endpoint for even-numbered ports, and one for odd- * numbered ports. Both bulk output endpoints have corresponding * input bulk endpoints (at indices 1 and 3) which aren't used. */ if (intfd->bNumEndpoints < (endpoint->out_cables > 0x0001 ? 5 : 3)) { dev_dbg(&umidi->dev->dev, "not enough endpoints\n"); return -ENOENT; } epd = get_endpoint(hostif, 0); if (!usb_endpoint_dir_in(epd) || !usb_endpoint_xfer_int(epd)) { dev_dbg(&umidi->dev->dev, "endpoint[0] isn't interrupt\n"); return -ENXIO; } epd = get_endpoint(hostif, 2); if (!usb_endpoint_dir_out(epd) || !usb_endpoint_xfer_bulk(epd)) { dev_dbg(&umidi->dev->dev, "endpoint[2] isn't bulk output\n"); return -ENXIO; } if (endpoint->out_cables > 0x0001) { epd = get_endpoint(hostif, 4); if (!usb_endpoint_dir_out(epd) || !usb_endpoint_xfer_bulk(epd)) { dev_dbg(&umidi->dev->dev, "endpoint[4] isn't bulk output\n"); return -ENXIO; } } ep_info.out_ep = get_endpoint(hostif, 2)->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; ep_info.out_interval = 0; ep_info.out_cables = endpoint->out_cables & 0x5555; err = snd_usbmidi_out_endpoint_create(umidi, &ep_info, &umidi->endpoints[0]); if (err < 0) return err; ep_info.in_ep = get_endpoint(hostif, 0)->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; ep_info.in_interval = get_endpoint(hostif, 0)->bInterval; ep_info.in_cables = endpoint->in_cables; err = snd_usbmidi_in_endpoint_create(umidi, &ep_info, &umidi->endpoints[0]); if (err < 0) return err; if (endpoint->out_cables > 0x0001) { ep_info.out_ep = get_endpoint(hostif, 4)->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; ep_info.out_cables = endpoint->out_cables & 0xaaaa; err = snd_usbmidi_out_endpoint_create(umidi, &ep_info, &umidi->endpoints[1]); if (err < 0) return err; } for (cable = 0; cable < 0x10; ++cable) { if (endpoint->out_cables & (1 << cable)) snd_usbmidi_init_substream(umidi, SNDRV_RAWMIDI_STREAM_OUTPUT, cable, -1 /* prevent trying to find jack */, &umidi->endpoints[cable & 1].out->ports[cable].substream); if (endpoint->in_cables & (1 << cable)) snd_usbmidi_init_substream(umidi, SNDRV_RAWMIDI_STREAM_INPUT, cable, -1 /* prevent trying to find jack */, &umidi->endpoints[0].in->ports[cable].substream); } return 0; } 
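/*
 * Illustrative helper, not used by the driver: it merely restates the
 * even/odd cable routing set up above, where the masks 0x5555 and 0xaaaa
 * hand the even and odd cables to the first and second Midiman output
 * endpoint respectively.
 */
static inline unsigned int midiman_out_ep_index(unsigned int cable)
{
	return cable & 1;	/* even cables -> endpoints[0], odd -> endpoints[1] */
}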
static const struct snd_rawmidi_global_ops snd_usbmidi_ops = { .get_port_info = snd_usbmidi_get_port_info, }; static int snd_usbmidi_create_rawmidi(struct snd_usb_midi *umidi, int out_ports, int in_ports) { struct snd_rawmidi *rmidi; int err; err = snd_rawmidi_new(umidi->card, "USB MIDI", umidi->next_midi_device++, out_ports, in_ports, &rmidi); if (err < 0) return err; strcpy(rmidi->name, umidi->card->shortname); rmidi->info_flags = SNDRV_RAWMIDI_INFO_OUTPUT | SNDRV_RAWMIDI_INFO_INPUT | SNDRV_RAWMIDI_INFO_DUPLEX; rmidi->ops = &snd_usbmidi_ops; rmidi->private_data = umidi; rmidi->private_free = snd_usbmidi_rawmidi_free; snd_rawmidi_set_ops(rmidi, SNDRV_RAWMIDI_STREAM_OUTPUT, &snd_usbmidi_output_ops); snd_rawmidi_set_ops(rmidi, SNDRV_RAWMIDI_STREAM_INPUT, &snd_usbmidi_input_ops); umidi->rmidi = rmidi; return 0; } /* * Temporarily stop input. */ void snd_usbmidi_input_stop(struct list_head *p) { struct snd_usb_midi *umidi; unsigned int i, j; umidi = list_entry(p, struct snd_usb_midi, list); if (!umidi->input_running) return; for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) { struct snd_usb_midi_endpoint *ep = &umidi->endpoints[i]; if (ep->in) for (j = 0; j < INPUT_URBS; ++j) usb_kill_urb(ep->in->urbs[j]); } umidi->input_running = 0; } EXPORT_SYMBOL(snd_usbmidi_input_stop); static void snd_usbmidi_input_start_ep(struct snd_usb_midi *umidi, struct snd_usb_midi_in_endpoint *ep) { unsigned int i; unsigned long flags; if (!ep) return; for (i = 0; i < INPUT_URBS; ++i) { struct urb *urb = ep->urbs[i]; spin_lock_irqsave(&umidi->disc_lock, flags); if (!atomic_read(&urb->use_count)) { urb->dev = ep->umidi->dev; snd_usbmidi_submit_urb(urb, GFP_ATOMIC); } spin_unlock_irqrestore(&umidi->disc_lock, flags); } } /* * Resume input after a call to snd_usbmidi_input_stop(). */ void snd_usbmidi_input_start(struct list_head *p) { struct snd_usb_midi *umidi; int i; umidi = list_entry(p, struct snd_usb_midi, list); if (umidi->input_running || !umidi->opened[1]) return; for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) snd_usbmidi_input_start_ep(umidi, umidi->endpoints[i].in); umidi->input_running = 1; } EXPORT_SYMBOL(snd_usbmidi_input_start); /* * Prepare for suspend. Typically called from the USB suspend callback. */ void snd_usbmidi_suspend(struct list_head *p) { struct snd_usb_midi *umidi; umidi = list_entry(p, struct snd_usb_midi, list); mutex_lock(&umidi->mutex); snd_usbmidi_input_stop(p); mutex_unlock(&umidi->mutex); } EXPORT_SYMBOL(snd_usbmidi_suspend); /* * Resume. Typically called from the USB resume callback. */ void snd_usbmidi_resume(struct list_head *p) { struct snd_usb_midi *umidi; umidi = list_entry(p, struct snd_usb_midi, list); mutex_lock(&umidi->mutex); snd_usbmidi_input_start(p); mutex_unlock(&umidi->mutex); } EXPORT_SYMBOL(snd_usbmidi_resume); /* * Creates and registers everything needed for a MIDI streaming interface. 
*/ int __snd_usbmidi_create(struct snd_card *card, struct usb_interface *iface, struct list_head *midi_list, const struct snd_usb_audio_quirk *quirk, unsigned int usb_id, unsigned int *num_rawmidis) { struct snd_usb_midi *umidi; struct snd_usb_midi_endpoint_info endpoints[MIDI_MAX_ENDPOINTS]; int out_ports, in_ports; int i, err; umidi = kzalloc(sizeof(*umidi), GFP_KERNEL); if (!umidi) return -ENOMEM; umidi->dev = interface_to_usbdev(iface); umidi->card = card; umidi->iface = iface; umidi->quirk = quirk; umidi->usb_protocol_ops = &snd_usbmidi_standard_ops; if (num_rawmidis) umidi->next_midi_device = *num_rawmidis; spin_lock_init(&umidi->disc_lock); init_rwsem(&umidi->disc_rwsem); mutex_init(&umidi->mutex); if (!usb_id) usb_id = USB_ID(le16_to_cpu(umidi->dev->descriptor.idVendor), le16_to_cpu(umidi->dev->descriptor.idProduct)); umidi->usb_id = usb_id; timer_setup(&umidi->error_timer, snd_usbmidi_error_timer, 0); /* detect the endpoint(s) to use */ memset(endpoints, 0, sizeof(endpoints)); switch (quirk ? quirk->type : QUIRK_MIDI_STANDARD_INTERFACE) { case QUIRK_MIDI_STANDARD_INTERFACE: err = snd_usbmidi_get_ms_info(umidi, endpoints); if (umidi->usb_id == USB_ID(0x0763, 0x0150)) /* M-Audio Uno */ umidi->usb_protocol_ops = &snd_usbmidi_maudio_broken_running_status_ops; break; case QUIRK_MIDI_US122L: umidi->usb_protocol_ops = &snd_usbmidi_122l_ops; fallthrough; case QUIRK_MIDI_FIXED_ENDPOINT: memcpy(&endpoints[0], quirk->data, sizeof(struct snd_usb_midi_endpoint_info)); err = snd_usbmidi_detect_endpoints(umidi, &endpoints[0], 1); break; case QUIRK_MIDI_YAMAHA: err = snd_usbmidi_detect_yamaha(umidi, &endpoints[0]); break; case QUIRK_MIDI_ROLAND: err = snd_usbmidi_detect_roland(umidi, &endpoints[0]); break; case QUIRK_MIDI_MIDIMAN: umidi->usb_protocol_ops = &snd_usbmidi_midiman_ops; memcpy(&endpoints[0], quirk->data, sizeof(struct snd_usb_midi_endpoint_info)); err = 0; break; case QUIRK_MIDI_NOVATION: umidi->usb_protocol_ops = &snd_usbmidi_novation_ops; err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints); break; case QUIRK_MIDI_RAW_BYTES: umidi->usb_protocol_ops = &snd_usbmidi_raw_ops; /* * Interface 1 contains isochronous endpoints, but with the same * numbers as in interface 0. Since it is interface 1 that the * USB core has most recently seen, these descriptors are now * associated with the endpoint numbers. This will foul up our * attempts to submit bulk/interrupt URBs to the endpoints in * interface 0, so we have to make sure that the USB core looks * again at interface 0 by calling usb_set_interface() on it. 
 */
		if (umidi->usb_id == USB_ID(0x07fd, 0x0001)) /* MOTU Fastlane */
			usb_set_interface(umidi->dev, 0, 0);
		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
		break;
	case QUIRK_MIDI_EMAGIC:
		umidi->usb_protocol_ops = &snd_usbmidi_emagic_ops;
		memcpy(&endpoints[0], quirk->data,
		       sizeof(struct snd_usb_midi_endpoint_info));
		err = snd_usbmidi_detect_endpoints(umidi, &endpoints[0], 1);
		break;
	case QUIRK_MIDI_CME:
		umidi->usb_protocol_ops = &snd_usbmidi_cme_ops;
		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
		break;
	case QUIRK_MIDI_AKAI:
		umidi->usb_protocol_ops = &snd_usbmidi_akai_ops;
		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
		/* endpoint 1 is input-only */
		endpoints[1].out_cables = 0;
		break;
	case QUIRK_MIDI_FTDI:
		umidi->usb_protocol_ops = &snd_usbmidi_ftdi_ops;

		/* set baud rate to 31250 (48 MHz / 16 / 96) */
		err = usb_control_msg(umidi->dev, usb_sndctrlpipe(umidi->dev, 0),
				      3, 0x40, 0x60, 0, NULL, 0, 1000);
		if (err < 0)
			break;

		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
		break;
	case QUIRK_MIDI_CH345:
		umidi->usb_protocol_ops = &snd_usbmidi_ch345_broken_sysex_ops;
		err = snd_usbmidi_detect_per_port_endpoints(umidi, endpoints);
		break;
	default:
		dev_err(&umidi->dev->dev, "invalid quirk type %d\n",
			quirk->type);
		err = -ENXIO;
		break;
	}
	if (err < 0)
		goto free_midi;

	/* create rawmidi device */
	out_ports = 0;
	in_ports = 0;
	for (i = 0; i < MIDI_MAX_ENDPOINTS; ++i) {
		out_ports += hweight16(endpoints[i].out_cables);
		in_ports += hweight16(endpoints[i].in_cables);
	}
	err = snd_usbmidi_create_rawmidi(umidi, out_ports, in_ports);
	if (err < 0)
		goto free_midi;

	/* create endpoint/port structures */
	if (quirk && quirk->type == QUIRK_MIDI_MIDIMAN)
		err = snd_usbmidi_create_endpoints_midiman(umidi, &endpoints[0]);
	else
		err = snd_usbmidi_create_endpoints(umidi, endpoints);
	if (err < 0)
		goto exit;

	usb_autopm_get_interface_no_resume(umidi->iface);

	list_add_tail(&umidi->list, midi_list);
	if (num_rawmidis)
		*num_rawmidis = umidi->next_midi_device;
	return 0;

free_midi:
	kfree(umidi);
exit:
	return err;
}
EXPORT_SYMBOL(__snd_usbmidi_create);
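/*
 * Usage sketch (hypothetical caller; the real callers live in the
 * snd-usb-audio probe path, and "chip" is an assumed context):
 *
 *	err = __snd_usbmidi_create(chip->card, iface, &chip->midi_list,
 *				   quirk, 0, NULL);
 *
 * Passing usb_id == 0 lets the function derive the ID from the device
 * descriptor, as seen above.  On disconnect, the caller walks the same
 * list and calls snd_usbmidi_disconnect() on each entry.
 */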
// SPDX-License-Identifier: GPL-2.0

#include <linux/fault-inject.h>
#include <linux/mm.h>

static struct {
	struct fault_attr attr;

	bool ignore_gfp_highmem;
	bool ignore_gfp_reclaim;
	u32 min_order;
} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_reclaim = true,
	.ignore_gfp_highmem = true,
	.min_order = 1,
};

static int __init setup_fail_page_alloc(char *str)
{
	return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	int flags = 0;

	if (order < fail_page_alloc.min_order)
		return false;
	if (gfp_mask & __GFP_NOFAIL)
		return false;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return false;
	if (fail_page_alloc.ignore_gfp_reclaim &&
	    (gfp_mask & __GFP_DIRECT_RECLAIM))
		return false;

	/* See comment in __should_failslab() */
	if (gfp_mask & __GFP_NOWARN)
		flags |= FAULT_NOWARN;

	return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_page_alloc_debugfs(void)
{
	umode_t mode = S_IFREG | 0600;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
					&fail_page_alloc.attr);

	debugfs_create_bool("ignore-gfp-wait", mode, dir,
			    &fail_page_alloc.ignore_gfp_reclaim);
	debugfs_create_bool("ignore-gfp-highmem", mode, dir,
			    &fail_page_alloc.ignore_gfp_highmem);
	debugfs_create_u32("min-order", mode, dir,
			   &fail_page_alloc.min_order);

	return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
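/*
 * Usage sketch (from the generic fault-injection convention, not code in
 * this file): setup_fault_attr() parses the boot parameter as
 * "<interval>,<probability>,<space>,<times>", so e.g.
 *
 *	fail_page_alloc=1,100,0,-1
 *
 * fails every eligible allocation (100% probability, no space budget,
 * unlimited times).  With CONFIG_FAULT_INJECTION_DEBUG_FS, the same
 * attributes plus the ignore-gfp-* and min-order knobs created above can
 * be tuned at runtime under /sys/kernel/debug/fail_page_alloc/.
 */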
// SPDX-License-Identifier: GPL-2.0

#include <linux/quotaops.h>
#include <linux/uuid.h>

#include "ext4.h"
#include "xattr.h"
#include "ext4_jbd2.h"

static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
					 const struct fscrypt_name *src)
{
	memset(dst, 0, sizeof(*dst));

	dst->usr_fname = src->usr_fname;
	dst->disk_name = src->disk_name;
	dst->hinfo.hash = src->hash;
	dst->hinfo.minor_hash = src->minor_hash;
	dst->crypto_buf = src->crypto_buf;
}

int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
			      int lookup, struct ext4_filename *fname)
{
	struct fscrypt_name name;
	int err;

	err = fscrypt_setup_filename(dir, iname, lookup, &name);
	if (err)
		return err;

	ext4_fname_from_fscrypt_name(fname, &name);

#if IS_ENABLED(CONFIG_UNICODE)
	err = ext4_fname_setup_ci_filename(dir, iname, fname);
	if (err)
		ext4_fname_free_filename(fname);
#endif
	return err;
}

int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
			      struct ext4_filename *fname)
{
	struct fscrypt_name name;
	int err;

	err = fscrypt_prepare_lookup(dir, dentry, &name);
	if (err)
		return err;

	ext4_fname_from_fscrypt_name(fname, &name);

#if IS_ENABLED(CONFIG_UNICODE)
	err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
	if (err)
		ext4_fname_free_filename(fname);
#endif
	return err;
}

void ext4_fname_free_filename(struct ext4_filename *fname)
{
	struct fscrypt_name name;

	name.crypto_buf = fname->crypto_buf;
	fscrypt_free_filename(&name);

	fname->crypto_buf.name = NULL;
	fname->usr_fname = NULL;
	fname->disk_name.name = NULL;

#if IS_ENABLED(CONFIG_UNICODE)
	kfree(fname->cf_name.name);
	fname->cf_name.name = NULL;
#endif
}

static bool uuid_is_zero(__u8 u[16])
{
	int i;

	for (i = 0; i < 16; i++)
		if (u[i])
			return false;
	return true;
}

int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg)
{
	struct super_block *sb = file_inode(filp)->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int err, err2;
	handle_t *handle;

	if (!ext4_has_feature_encrypt(sb))
		return -EOPNOTSUPP;

	if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
		err = mnt_want_write_file(filp);
		if (err)
			return err;
		handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
		if (IS_ERR(handle)) {
			err = PTR_ERR(handle);
			goto pwsalt_err_exit;
		}
		err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh,
						    EXT4_JTR_NONE);
		if (err)
			goto pwsalt_err_journal;
		lock_buffer(sbi->s_sbh);
		generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
		ext4_superblock_csum_set(sb);
		unlock_buffer(sbi->s_sbh);
		err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
pwsalt_err_journal:
		err2 = ext4_journal_stop(handle);
		if (err2 && !err)
			err = err2;
pwsalt_err_exit:
		mnt_drop_write_file(filp);
		if (err)
			return err;
	}

	if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16))
		return -EFAULT;
	return 0;
}

static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
			      EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
}

static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
			    void *fs_data)
{
	handle_t *handle = fs_data;
	int res, res2, credits, retries = 0;

	/*
	 * Encrypting the root directory is not allowed because e2fsck expects
	 * lost+found to exist and be unencrypted, and encrypting the root
	 * directory would imply encrypting the lost+found directory as well as
	 * the filename "lost+found" itself.
	 */
	if (inode->i_ino == EXT4_ROOT_INO)
		return -EPERM;

	if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
		return -EINVAL;

	if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
		return -EOPNOTSUPP;

	res = ext4_convert_inline_data(inode);
	if (res)
		return res;

	/*
	 * If a journal handle was specified, then the encryption context is
	 * being set on a new inode via inheritance and is part of a larger
	 * transaction to create the inode.  Otherwise the encryption context is
	 * being set on an existing inode in its own transaction.  Only in the
	 * latter case should the "retry on ENOSPC" logic be used.
	 */
	if (handle) {
		res = ext4_xattr_set_handle(handle, inode,
					    EXT4_XATTR_INDEX_ENCRYPTION,
					    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
					    ctx, len, 0);
		if (!res) {
			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
			ext4_clear_inode_state(inode,
					EXT4_STATE_MAY_INLINE_DATA);
			/*
			 * Update inode->i_flags - S_ENCRYPTED will be enabled,
			 * S_DAX may be disabled
			 */
			ext4_set_inode_flags(inode, false);
		}
		return res;
	}

	res = dquot_initialize(inode);
	if (res)
		return res;
retry:
	res = ext4_xattr_set_credits(inode, len, false /* is_create */,
				     &credits);
	if (res)
		return res;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
				    ctx, len, 0);
	if (!res) {
		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
		/*
		 * Update inode->i_flags - S_ENCRYPTED will be enabled,
		 * S_DAX may be disabled
		 */
		ext4_set_inode_flags(inode, false);
		res = ext4_mark_inode_dirty(handle, inode);
		if (res)
			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
	}
	res2 = ext4_journal_stop(handle);

	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	if (!res)
		res = res2;
	return res;
}

static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
	return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}

static bool ext4_has_stable_inodes(struct super_block *sb)
{
	return ext4_has_feature_stable_inodes(sb);
}

const struct fscrypt_operations ext4_cryptops = {
	.needs_bounce_pages		= 1,
	.has_32bit_inodes		= 1,
	.supports_subblock_data_units	= 1,
	.legacy_key_prefix		= "ext4:",
	.get_context			= ext4_get_context,
	.set_context			= ext4_set_context,
	.get_dummy_policy		= ext4_get_dummy_policy,
	.empty_dir			= ext4_empty_dir,
	.has_stable_inodes		= ext4_has_stable_inodes,
};
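/*
 * Registration sketch (an assumption about the mount path, not code from
 * this file): ext4 is expected to wire the table above into fscrypt while
 * setting up the superblock, roughly:
 *
 *	fscrypt_set_ops(sb, &ext4_cryptops);
 *
 * after which fscrypt calls back into ->get_context()/->set_context()
 * above to read and persist per-inode encryption xattrs.
 */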
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>

#include <linux/auxvec.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/rcupdate.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>

#include <asm/mmu.h>

#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))

#define INIT_PASID	0

struct address_space;
struct mem_cgroup;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment.
Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * * If you allocate the page using alloc_pages(), you can use some of the * space in struct page for your own purposes. The five words in the main * union are available, except for bit 0 of the first word which must be * kept clear. Many users use this word to store a pointer to an object * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. * * If your page will not be mapped to userspace, you can also use the four * bytes in the mapcount union, but you must call page_mapcount_reset() * before freeing it. * * If you want to use the refcount field, it must be used in such a way * that other CPUs temporarily incrementing and then decrementing the * refcount does not cause problems. On receiving the page from * alloc_pages(), the refcount will be positive. * * If you allocate pages of order > 0, you can use some of the fields * in each subpage, but you may need to restore some of their values * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and counters. * That requires that freelist & counters in struct slab be adjacent and * double-word aligned. Because struct slab currently just reinterprets the * bits of struct page, we align all struct pages to double-word boundaries, * and ensure that 'freelist' is aligned within struct slab. */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else #define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. * WARNING: bit 0 of the first word is used for PageTail(). That * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ union { struct { /* Page cache and anonymous pages */ /** * @lru: Pageout list, eg. active_list protected by * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ union { struct list_head lru; /* Or, for the Unevictable "LRU list" slot */ struct { /* Always even, to negate PageTail */ void *__filler; /* Count page's or folio's mlocks */ unsigned int mlock_count; }; /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; union { pgoff_t index; /* Our offset within mapping. */ unsigned long share; /* share count for fsdax */ }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if PageSwapCache. * Indicates order in the buddy system if PageBuddy. */ unsigned long private; }; struct { /* page_pool used by netstack */ /** * @pp_magic: magic value to avoid recycling non * page_pool allocated pages. */ unsigned long pp_magic; struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. 
			 */
			struct dev_pagemap *pgmap;
			void *zone_device_data;
			/*
			 * ZONE_DEVICE private pages are counted as being
			 * mapped so the next 3 words hold the mapping, index,
			 * and private fields from the source anonymous or
			 * page cache page while the page is migrated to device
			 * private memory.
			 * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
			 * use the mapping, index, and private fields when
			 * pmem backed DAX files are mapped.
			 */
		};

		/** @rcu_head: You can use this to free a page by RCU. */
		struct rcu_head rcu_head;
	};

	union {		/* This union is 4 bytes in size. */
		/*
		 * If the page can be mapped to userspace, encodes the number
		 * of times this page is referenced by a page table.
		 */
		atomic_t _mapcount;

		/*
		 * If the page is neither PageSlab nor mappable to userspace,
		 * the value stored here may help determine what this page
		 * is used for.  See page-flags.h for a list of page types
		 * which are currently stored here.
		 */
		unsigned int page_type;
	};

	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
	atomic_t _refcount;

#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;		/* Kernel virtual address (NULL if
				   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif

#ifdef CONFIG_KMSAN
	/*
	 * KMSAN metadata for this page:
	 *  - shadow page: every bit indicates whether the corresponding
	 *    bit of the original page is initialized (0) or not (1);
	 *  - origin page: every 4 bytes contain an id of the stack trace
	 *    where the uninitialized value was created.
	 */
	struct page *kmsan_shadow;
	struct page *kmsan_origin;
#endif
} _struct_page_alignment;

/*
 * struct encoded_page - a nonexistent type marking this pointer
 *
 * An 'encoded_page' pointer is a pointer to a regular 'struct page', but
 * with the low bits of the pointer indicating extra context-dependent
 * information. Not super-common, but happens in mmu_gather and mlock
 * handling, and this acts as a type system check on that use.
 *
 * We only really have two guaranteed bits in general, although you could
 * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 * for more.
 *
 * Use the supplied helper functions to encode/decode the pointer and bits.
 */
struct encoded_page;
#define ENCODE_PAGE_BITS 3ul
static __always_inline struct encoded_page *encode_page(struct page *page,
							unsigned long flags)
{
	BUILD_BUG_ON(flags > ENCODE_PAGE_BITS);
	return (struct encoded_page *)(flags | (unsigned long)page);
}

static inline unsigned long encoded_page_flags(struct encoded_page *page)
{
	return ENCODE_PAGE_BITS & (unsigned long)page;
}

static inline struct page *encoded_page_ptr(struct encoded_page *page)
{
	return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
}

/*
 * A swap entry has to fit into a "unsigned long", as the entry is hidden
 * in the "index" field of the swapper address space.
 */
typedef struct {
	unsigned long val;
} swp_entry_t;

/**
 * struct folio - Represents a contiguous set of bytes.
 * @flags: Identical to the page flags.
 * @lru: Least Recently Used list; tracks how recently this folio was used.
* @mlock_count: Number of times this folio has been pinned by mlock(). * @mapping: The file this page belongs to, or refers to the anon_vma for * anonymous memory. * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. * @private: Filesystem per-folio data (see folio_attach_private()). * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. Use folio_mapcount() to * find out how many times this folio is mapped by userspace. * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @virtual: Virtual address in the kernel direct map. * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * @_deferred_list: Folios to be split under memory pressure. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that * same power-of-two. It is at least as large as %PAGE_SIZE. If it is * in the page cache, it is at a file offset which is a multiple of that * power-of-two. It may be mapped into userspace at an address which is * at an arbitrary page offset, but its kernel virtual address is aligned * to its size. 
*/ struct folio { /* private: don't document the anon union */ union { struct { /* public: */ unsigned long flags; union { struct list_head lru; /* private: avoid cluttering the output */ struct { void *__filler; /* public: */ unsigned int mlock_count; /* private: */ }; /* public: */ }; struct address_space *mapping; pgoff_t index; union { void *private; swp_entry_t swap; }; atomic_t _mapcount; atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #endif #if defined(WANT_PAGE_VIRTUAL) void *virtual; #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif /* private: the union with struct page is transitional */ }; struct page page; }; union { struct { unsigned long _flags_1; unsigned long _head_1; unsigned long _folio_avail; /* public: */ atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif /* private: the union with struct page is transitional */ }; struct page __page_1; }; union { struct { unsigned long _flags_2; unsigned long _head_2; /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; /* private: the union with struct page is transitional */ }; struct { unsigned long _flags_2a; unsigned long _head_2a; /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ }; struct page __page_2; }; }; #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); FOLIO_MATCH(lru, lru); FOLIO_MATCH(mapping, mapping); FOLIO_MATCH(compound_head, lru); FOLIO_MATCH(index, index); FOLIO_MATCH(private, private); FOLIO_MATCH(_mapcount, _mapcount); FOLIO_MATCH(_refcount, _refcount); #ifdef CONFIG_MEMCG FOLIO_MATCH(memcg_data, memcg_data); #endif #if defined(WANT_PAGE_VIRTUAL) FOLIO_MATCH(virtual, virtual); #endif #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS FOLIO_MATCH(_last_cpupid, _last_cpupid); #endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); FOLIO_MATCH(flags, _flags_2a); FOLIO_MATCH(compound_head, _head_2a); #undef FOLIO_MATCH /** * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Unused for page tables. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 and x86. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good * understanding of the issues. 
*/ struct ptdesc { unsigned long __page_flags; union { struct rcu_head pt_rcu_head; struct list_head pt_list; struct { unsigned long _pt_pad_1; pgtable_t pmd_huge_pte; }; }; unsigned long __page_mapping; union { struct mm_struct *pt_mm; atomic_t pt_frag_refcount; }; union { unsigned long _pt_pad_2; #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else spinlock_t ptl; #endif }; unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif }; #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif #undef TABLE_MATCH static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); #define ptdesc_page(pt) (_Generic((pt), \ const struct ptdesc *: (const struct page *)(pt), \ struct ptdesc *: (struct page *)(pt))) #define ptdesc_folio(pt) (_Generic((pt), \ const struct ptdesc *: (const struct folio *)(pt), \ struct ptdesc *: (struct folio *)(pt))) #define page_ptdesc(p) (_Generic((p), \ const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) /* * Used for sizing the vmemmap region on some architectures */ #define STRUCT_PAGE_MAX_SHIFT (order_base_2(sizeof(struct page))) #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) /* * page_private can be used on tail pages. However, PagePrivate is only * checked by the VM on the head page. So page_private on the tail pages * should be used for data that's ancillary to the head page (e.g. attaching * buffer heads to tail pages after attaching buffer heads to the head page) */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) { page->private = private; } static inline void *folio_get_private(struct folio *folio) { return folio->private; } struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) __u16 offset; __u16 size; #else __u32 offset; #endif /* we maintain a pagecount bias, so that we don't dirty cache line * containing page->_refcount every time we allocate a fragment. */ unsigned int pagecnt_bias; bool pfmemalloc; }; typedef unsigned long vm_flags_t; /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that * map parts of them.
*/ struct vm_region { struct rb_node vm_rb; /* link in global region tree */ vm_flags_t vm_flags; /* VMA vm_flags */ unsigned long vm_start; /* start address of region */ unsigned long vm_end; /* region initialised to here */ unsigned long vm_top; /* region allocated to here */ unsigned long vm_pgoff; /* the offset in vm_file corresponding to vm_start */ struct file *vm_file; /* the backing file or NULL */ int vm_usage; /* region usage count (access under nommu_region_sem) */ bool vm_icache_flushed : 1; /* true if the icache has been flushed for * this region */ }; #ifdef CONFIG_USERFAULTFD #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) struct vm_userfaultfd_ctx { struct userfaultfd_ctx *ctx; }; #else /* CONFIG_USERFAULTFD */ #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ struct anon_vma_name { struct kref kref; /* The name needs to be at the end because it is dynamically sized. */ char name[]; }; #ifdef CONFIG_ANON_VMA_NAME /* * mmap_lock should be read-locked when calling anon_vma_name(). Caller should * either keep holding the lock while using the returned pointer or it should * raise anon_vma_name refcount before releasing the lock. */ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); struct anon_vma_name *anon_vma_name_alloc(const char *name); void anon_vma_name_free(struct kref *kref); #else /* CONFIG_ANON_VMA_NAME */ static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { return NULL; } static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) { return NULL; } #endif struct vma_lock { struct rw_semaphore lock; }; struct vma_numab_state { /* * Initialised as time in 'jiffies' after which VMA * should be scanned. Delays first scan of new VMA by at * least sysctl_numa_balancing_scan_delay: */ unsigned long next_scan; /* * Time in jiffies when pids_active[] is reset to * detect phase change behaviour: */ unsigned long pids_active_reset; /* * Approximate tracking of PIDs that trapped a NUMA hinting * fault. May produce false positives due to hash collisions. * * [0] Previous PID tracking * [1] Current PID tracking * * Window moves after next_pid_reset has expired approximately * every VMA_PID_RESET_PERIOD jiffies: */ unsigned long pids_active[2]; /* MM scan sequence ID when scan first started after VMA creation */ int start_scan_seq; /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq */ int prev_scan_seq; }; /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ union { struct { /* VMA covers [vm_start; vm_end) addresses within mm */ unsigned long vm_start; unsigned long vm_end; }; #ifdef CONFIG_PER_VMA_LOCK struct rcu_head vm_rcu; /* Used for deferred freeing. */ #endif }; struct mm_struct *vm_mm; /* The address space we belong to. */ pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. 
*/ union { const vm_flags_t vm_flags; vm_flags_t __private __vm_flags; }; #ifdef CONFIG_PER_VMA_LOCK /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) * - vm_lock->lock (in write mode) * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) * - vm_lock->lock (in read or write mode) * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * * This sequence counter is explicitly allowed to overflow; sequence * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ int vm_lock_seq; struct vma_lock *vm_lock; /* Flag to indicate areas detached from the mm->mm_mt tree */ bool detached; #endif /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * */ struct { struct rb_node rb; unsigned long rb_subtree_last; } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack * or brk vma (with NULL file) can only be in an anon_vma list. */ struct list_head anon_vma_chain; /* Serialized by mmap_lock & * page_table_lock */ struct anon_vma *anon_vma; /* Serialized by page_table_lock */ /* Function pointers to deal with this struct. */ const struct vm_operations_struct *vm_ops; /* Information about our backing store: */ unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE units */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ #ifdef CONFIG_ANON_VMA_NAME /* * For private and shared anonymous mappings, a pointer to a null * terminated string containing the name given to the vma, or NULL if * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. */ struct anon_vma_name *anon_name; #endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else #define vma_policy(vma) NULL #endif #ifdef CONFIG_SCHED_MM_CID struct mm_cid { u64 time; int cid; }; #endif struct kioctx_table; struct iommu_mm_data; struct mm_struct { struct { /* * Fields which are often written to are placed in a separate * cache line. */ struct { /** * @mm_count: The number of references to &struct * mm_struct (@mm_users count as 1). * * Use mmgrab()/mmdrop() to modify. When this drops to * 0, the &struct mm_struct is freed. 
*/ atomic_t mm_count; } ____cacheline_aligned_in_smp; struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* Base addresses for compatible mmap() */ unsigned long mmap_compat_base; unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER /** * @membarrier_state: Flags controlling membarrier behavior. * * This field is close to @pgd to hopefully fit in the same * cache-line, which needs to be touched by switch_mm(). */ atomic_t membarrier_state; #endif /** * @mm_users: The number of users including userspace. * * Use mmget()/mmget_not_zero()/mmput() to modify. When this * drops to 0 (i.e. when the task exits and there are no other * temporary reference holders), we also release a reference on * @mm_count (which may then free the &struct mm_struct if * @mm_count also drops to 0). */ atomic_t mm_users; #ifdef CONFIG_SCHED_MM_CID /** * @pcpu_cid: Per-cpu current cid. * * Keep track of the currently allocated mm_cid for each cpu. * The per-cpu mm_cid values are serialized by their respective * runqueue locks. */ struct mm_cid __percpu *pcpu_cid; /* * @mm_cid_next_scan: Next mm_cid scan (in jiffies). * * When the next mm_cid scan is due (in jiffies). */ unsigned long mm_cid_next_scan; #endif #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some * counters */ /* * With some kernel config, the current mmap_lock's offset * inside 'mm_struct' is at 0x120, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * * So please be careful with adding new fields before * mmap_lock, which can easily push the 2 fields into one * cacheline. */ struct rw_semaphore mmap_lock; struct list_head mmlist; /* List of maybe swapped mm's. These * are globally strung together off * init_mm.mmlist, and are protected * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. * * Can be modified under write mmap_lock using RELEASE * semantics. * Can be read with no other protection when holding write * mmap_lock. * Can be read with ACQUIRE semantics if not holding write * mmap_lock. 
*/ int mm_lock_seq; #endif unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; /** * @write_protect_seq: Locked when any thread is write * protecting pages mapped by this mm to enforce a later COW, * for instance during page table copying for fork(). */ seqcount_t write_protect_seq; spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; /* Architecture-specific MM context */ mm_context_t context; unsigned long flags; /* Must use atomic bitops to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in * order for it to be changed: * * current == mm->owner * current->mm != mm * new_owner->mm == mm * new_owner->alloc_lock is held */ struct task_struct __rcu *owner; #endif struct user_namespace *user_ns; /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING /* * numa_next_scan is the next time that PTEs will be remapped * PROT_NONE to trigger NUMA hinting faults; such faults gather * statistics and migrate pages to new nodes if necessary. */ unsigned long numa_next_scan; /* Restart point for scanning and remapping PTEs. */ unsigned long numa_scan_offset; /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT struct rcu_head delayed_drop; #endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; #ifdef CONFIG_IOMMU_MM_DATA struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM * merging (not including ksm_zero_pages). */ unsigned long ksm_merging_pages; /* * Represent how many pages are checked for ksm merging * including merged and not merged. */ unsigned long ksm_rmap_items; /* * Represent how many empty pages are merged with kernel zero * pages when enabling KSM use_zero_pages. 
*/ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; /* * Set when switching to this mm_struct, as a hint of * whether it has been used since the last time per-node * page table walkers cleared the corresponding bits. */ unsigned long bitmap; #ifdef CONFIG_MEMCG /* points to the memcg of "owner" above */ struct mem_cgroup *memcg; #endif } lru_gen; #endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. */ unsigned long cpu_bitmap[]; }; #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { unsigned long cpu_bitmap = (unsigned long)mm; cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { return (struct cpumask *)&mm->cpu_bitmap; } #ifdef CONFIG_LRU_GEN struct lru_gen_mm_list { /* mm_struct list for page table walkers */ struct list_head fifo; /* protects the list above */ spinlock_t lock; }; #endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_LRU_GEN_WALKS_MMU void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); void lru_gen_migrate_mm(struct mm_struct *mm); static inline void lru_gen_init_mm(struct mm_struct *mm) { INIT_LIST_HEAD(&mm->lru_gen.list); mm->lru_gen.bitmap = 0; #ifdef CONFIG_MEMCG mm->lru_gen.memcg = NULL; #endif } static inline void lru_gen_use_mm(struct mm_struct *mm) { /* * When the bitmap is set, page reclaim knows this mm_struct has been * used since the last time it cleared the bitmap. So it might be worth * walking the page tables of this mm_struct to clear the accessed bit. */ WRITE_ONCE(mm->lru_gen.bitmap, -1); } #else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { } static inline void lru_gen_del_mm(struct mm_struct *mm) { } static inline void lru_gen_migrate_mm(struct mm_struct *mm) { } static inline void lru_gen_init_mm(struct mm_struct *mm) { } static inline void lru_gen_use_mm(struct mm_struct *mm) { } #endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; }; #define VMA_ITERATOR(name, __mm, __addr) \ struct vma_iterator name = { \ .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ .node = NULL, \ .status = ma_start, \ }, \ } static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { mas_init(&vmi->mas, &mm->mm_mt, addr); } #ifdef CONFIG_SCHED_MM_CID enum mm_cid_state { MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ MM_CID_LAZY_PUT = (1U << 31), }; static inline bool mm_cid_is_unset(int cid) { return cid == MM_CID_UNSET; } static inline bool mm_cid_is_lazy_put(int cid) { return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); } static inline bool mm_cid_is_valid(int cid) { return !(cid & MM_CID_LAZY_PUT); } static inline int mm_cid_set_lazy_put(int cid) { return cid | MM_CID_LAZY_PUT; } static inline int mm_cid_clear_lazy_put(int cid) { return cid & ~MM_CID_LAZY_PUT; } /* Accessor for struct mm_struct's cidmask. 
*/ static inline cpumask_t *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm; cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); /* Skip cpu_bitmap */ cid_bitmap += cpumask_size(); return (struct cpumask *)cid_bitmap; } static inline void mm_init_cid(struct mm_struct *mm) { int i; for_each_possible_cpu(i) { struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); pcpu_cid->cid = MM_CID_UNSET; pcpu_cid->time = 0; } cpumask_clear(mm_cidmask(mm)); } static inline int mm_alloc_cid(struct mm_struct *mm) { mm->pcpu_cid = alloc_percpu(struct mm_cid); if (!mm->pcpu_cid) return -ENOMEM; mm_init_cid(mm); return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { free_percpu(mm->pcpu_cid); mm->pcpu_cid = NULL; } static inline unsigned int mm_cid_size(void) { return cpumask_size(); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm) { } static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } static inline unsigned int mm_cid_size(void) { return 0; } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); struct vm_fault; /** * typedef vm_fault_t - Return type for page fault handlers. * * Page fault handlers return a bitmask of %VM_FAULT values. */ typedef __bitwise unsigned int vm_fault_t; /** * enum vm_fault_reason - Page fault handlers return a bitmask of * these values to tell the core VM what happened when handling the * fault. Used to decide whether a process gets delivered SIGBUS or * just gets major/minor fault counters bumped up. * * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. 
Index encoded * in upper bits * @VM_FAULT_SIGSEGV: segmentation fault * @VM_FAULT_NOPAGE: ->fault installed the pte, not return page * @VM_FAULT_LOCKED: ->fault locked the returned page * @VM_FAULT_RETRY: ->fault blocked, must retry * @VM_FAULT_FALLBACK: huge page fault failed, fall back to small * @VM_FAULT_DONE_COW: ->fault has fully handled COW * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs * fsync() to complete (for synchronous page faults * in DAX) * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, VM_FAULT_NOPAGE = (__force vm_fault_t)0x000100, VM_FAULT_LOCKED = (__force vm_fault_t)0x000200, VM_FAULT_RETRY = (__force vm_fault_t)0x000400, VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16)) #define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf) #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | \ VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON | \ VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK) #define VM_FAULT_RESULT_TRACE \ { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ { VM_FAULT_NOPAGE, "NOPAGE" }, \ { VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ { VM_FAULT_COMPLETED, "COMPLETED" } struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". */ /* * If .fault is not provided, this points to a * NULL-terminated array of pages that back the special mapping. * * This must not be NULL unless .fault is provided. */ struct page **pages; /* * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ vm_fault_t (*fault)(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); }; enum tlb_flush_reason { TLB_FLUSH_ON_TASK_SWITCH, TLB_REMOTE_SHOOTDOWN, TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, NR_TLB_FLUSH_REASONS, }; /** * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. * @FAULT_FLAG_TRIED: The fault has been tried once. * @FAULT_FLAG_USER: The fault originated in userspace. * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. 
* @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a * COW mapping, making sure that an exclusive anon page is * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two * fault flags correctly. Currently there can be three legal combinations: * * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and * this is the first try * * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and * we've already tried at least once * * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry * * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never * be used. Note that page faults can be allowed to retry for multiple times, * in which case we'll have an initial fault with flags (a) then later on * continuous faults with flags (b). We should always try to detect pending * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, FAULT_FLAG_MKWRITE = 1 << 1, FAULT_FLAG_ALLOW_RETRY = 1 << 2, FAULT_FLAG_RETRY_NOWAIT = 1 << 3, FAULT_FLAG_KILLABLE = 1 << 4, FAULT_FLAG_TRIED = 1 << 5, FAULT_FLAG_USER = 1 << 6, FAULT_FLAG_REMOTE = 1 << 7, FAULT_FLAG_INSTRUCTION = 1 << 8, FAULT_FLAG_INTERRUPTIBLE = 1 << 9, FAULT_FLAG_UNSHARE = 1 << 10, FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, FAULT_FLAG_VMA_LOCK = 1 << 12, }; typedef unsigned int __bitwise zap_flags_t; /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each * other. Here is what they mean, and how to use them: * * * FIXME: For pages which are part of a filesystem, mappings are subject to the * lifetime enforced by the filesystem and we need guarantees that longterm * users like RDMA and V4L2 only establish mappings which coordinate usage with * the filesystem. Ideas for this coordination include revoking the longterm * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was * added after the problem with filesystems was found FS DAX VMAs are * specifically failed. Filesystem pages are still subject to bugs and use of * FOLL_LONGTERM should be avoided on those pages. * * In the CMA case: long term pins in a CMA region would unnecessarily fragment * that region. And so, CMA attempts to migrate the page before pinning, when * FOLL_LONGTERM is specified. * * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, * but an additional pin counting system) will be invoked. This is intended for * anything that gets a page reference and then touches page data (for example, * Direct IO). This lets the filesystem know that some non-file-system entity is * potentially changing the pages' data. In contrast to FOLL_GET (whose pages * are released via put_page()), FOLL_PIN pages must be released, ultimately, by * a call to unpin_user_page(). * * FOLL_PIN is similar to FOLL_GET: both of these pin pages. 
They use different * and separate refcounting mechanisms, however, and that means that each has * its own acquire and release mechanisms: * * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. * * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. * * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based * calls applied to them, and that's perfectly OK. This is a constraint on the * callers, not on the pages.) * * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never * directly by the caller. That's in order to help avoid mismatches when * releasing pages: get_user_pages*() pages must be released via put_page(), * while pin_user_pages*() pages must be released via unpin_user_page(). * * Please see Documentation/core-api/pin_user_pages.rst for more information. */ enum { /* check pte is writable */ FOLL_WRITE = 1 << 0, /* do get_page on page */ FOLL_GET = 1 << 1, /* give error on hole if it would be zero */ FOLL_DUMP = 1 << 2, /* get_user_pages read/write w/o permission */ FOLL_FORCE = 1 << 3, /* * if a disk transfer is needed, start the IO and return without waiting * upon it */ FOLL_NOWAIT = 1 << 4, /* do not fault in pages */ FOLL_NOFAULT = 1 << 5, /* check page is hwpoisoned */ FOLL_HWPOISON = 1 << 6, /* don't do file mappings */ FOLL_ANON = 1 << 7, /* * FOLL_LONGTERM indicates that the page will be held for an indefinite * time period _often_ under userspace control. This is in contrast to * iov_iter_get_pages(), whose usages are transient. */ FOLL_LONGTERM = 1 << 8, /* split huge pmd before returning */ FOLL_SPLIT_PMD = 1 << 9, /* allow returning PCI P2PDMA pages */ FOLL_PCI_P2PDMA = 1 << 10, /* allow interrupts from generic signals */ FOLL_INTERRUPTIBLE = 1 << 11, /* * Always honor (trigger) NUMA hinting faults. * * FOLL_WRITE implicitly honors NUMA hinting faults because a * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE * apply). get_user_pages_fast_only() always implicitly honors NUMA * hinting faults. */ FOLL_HONOR_NUMA_FAULT = 1 << 12, /* See also internal only FOLL flags in mm/internal.h */ }; #endif /* _LINUX_MM_TYPES_H */
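/*
 * Illustrative sketch, not part of the original header: a round-trip
 * through the encoded_page helpers declared above. The function name
 * demo_encoded_page() is hypothetical; everything else comes from this
 * header. ENCODE_PAGE_BITS is 3ul, i.e. a mask of the two guaranteed
 * low pointer bits.
 */
static __always_inline void demo_encoded_page(struct page *page)
{
	/* Pack flag value 1 into the low bits of the page pointer. */
	struct encoded_page *ep = encode_page(page, 1);

	/* Unpack: masking with ~ENCODE_PAGE_BITS recovers the clean pointer. */
	unsigned long flags = encoded_page_flags(ep);	/* == 1 */
	struct page *p = encoded_page_ptr(ep);		/* == page */

	/* WARN_ON_ONCE() assumed available via <linux/bug.h>. */
	WARN_ON_ONCE(flags != 1 || p != page);
}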
/* SPDX-License-Identifier: GPL-2.0 */ /* * sysctl.h: General linux system control interface * * Begun 24 March 1995, Stephen Tweedie * **************************************************************** **************************************************************** ** ** WARNING: ** The values in this file are exported to user space via ** the sysctl() binary interface. Do *NOT* change the ** numbering of any existing values here, and do not change ** any numbers within any one set of values. If you have to ** redefine an existing interface, use a new number for it. ** The kernel will then return -ENOTDIR to any application using ** the old binary interface. ** **************************************************************** **************************************************************** */ #ifndef _LINUX_SYSCTL_H #define _LINUX_SYSCTL_H #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/uidgid.h> #include <uapi/linux/sysctl.h> /* For the /proc/sys support */ struct completion; struct ctl_table; struct nsproxy; struct ctl_table_root; struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ #define SYSCTL_ZERO ((void *)&sysctl_vals[0]) #define SYSCTL_ONE ((void *)&sysctl_vals[1]) #define SYSCTL_TWO ((void *)&sysctl_vals[2]) #define SYSCTL_THREE ((void *)&sysctl_vals[3]) #define SYSCTL_FOUR ((void *)&sysctl_vals[4]) #define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) #define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) #define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) #define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) #define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ #define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) #define SYSCTL_NEG_ONE ((void *)&sysctl_vals[11]) extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) #define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) #define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) extern const unsigned long sysctl_long_vals[]; typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dobool(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); int
proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, size_t *, loff_t *); int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_do_static_key(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl * with an initialised array of struct ctl_table's. An entry with * NULL procname terminates the table. table->de will be * set up by the registration and need not be initialised in advance. * * sysctl names can be mirrored automatically under /proc/sys. The * procname supplied controls /proc naming. * * The table's mode will be honoured for proc-fs access. * * Leaf nodes in the sysctl tree will be represented by a single file * under /proc; non-leaf nodes will be represented by directories. A * null procname disables /proc mirroring at this node. * * The data and maxlen fields of the ctl_table * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * * There must be a proc_handler routine for any terminal nodes * mirrored under /proc/sys (non-terminals are handled by a built-in * directory handler). Several default handlers are available to * cover common cases. */ /* Support for userspace poll() to watch for changes */ struct ctl_table_poll { atomic_t event; wait_queue_head_t wait; }; static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) { return (void *)(unsigned long)atomic_read(&poll->event); } #define __CTL_TABLE_POLL_INITIALIZER(name) { \ .event = ATOMIC_INIT(0), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } #define DEFINE_CTL_TABLE_POLL(name) \ struct ctl_table_poll name = __CTL_TABLE_POLL_INITIALIZER(name) /* A sysctl table is an array of struct ctl_table: */ struct ctl_table { const char *procname; /* Text ID for /proc/sys, or zero */ void *data; int maxlen; umode_t mode; /** * enum type - Enumeration to differentiate between ctl target types * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently * empty directory target to serve * as mount point. 
*/ enum { SYSCTL_TABLE_TYPE_DEFAULT, SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY } type; proc_handler *proc_handler; /* Callback for text formatting */ struct ctl_table_poll *poll; void *extra1; void *extra2; } __randomize_layout; struct ctl_node { struct rb_node node; struct ctl_table_header *header; }; /** * struct ctl_table_header - maintains dynamic lists of struct ctl_table trees * @ctl_table: pointer to the first element in ctl_table array * @ctl_table_size: number of elements pointed to by @ctl_table * @used: The entry will never be touched when equal to 0. * @count: Upped every time something is added to @inodes and downed every time * something is removed from @inodes * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered. * @rcu: Delays the freeing of the inode. Introduced with "unfuck proc_sysctl ->d_compare()" * */ struct ctl_table_header { union { struct { struct ctl_table *ctl_table; int ctl_table_size; int used; int count; int nreg; }; struct rcu_head rcu; }; struct completion *unregistering; struct ctl_table *ctl_table_arg; struct ctl_table_root *root; struct ctl_table_set *set; struct ctl_dir *parent; struct ctl_node *node; struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ }; struct ctl_dir { /* Header must be at the start of ctl_dir */ struct ctl_table_header header; struct rb_root root; }; struct ctl_table_set { int (*is_seen)(struct ctl_table_set *); struct ctl_dir dir; }; struct ctl_table_root { struct ctl_table_set default_set; struct ctl_table_set *(*lookup)(struct ctl_table_root *root); void (*set_ownership)(struct ctl_table_header *head, struct ctl_table *table, kuid_t *uid, kgid_t *gid); int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); }; #define register_sysctl(path, table) \ register_sysctl_sz(path, table, ARRAY_SIZE(table)) #ifdef CONFIG_SYSCTL void proc_sys_poll_notify(struct ctl_table_poll *poll); extern void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)); extern void retire_sysctl_set(struct ctl_table_set *set); struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, struct ctl_table *table, size_t table_size); struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, size_t table_size); void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init_bases(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name, size_t table_size); #define register_sysctl_init(path, table) \ __register_sysctl_init(path, table, #table, ARRAY_SIZE(table)) extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); bool sysctl_is_alias(char *param); int do_proc_douintvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data); extern int pwrsw_enabled; extern int unaligned_enabled; extern int unaligned_dump_stack; extern int no_unaligned_warning; #else /* CONFIG_SYSCTL */ static inline void register_sysctl_init(const char *path, struct ctl_table *table) { } static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; } static inline struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, size_t table_size) { return NULL; } static inline void unregister_sysctl_table(struct ctl_table_header * table) { }
static inline void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_root *root, int (*is_seen)(struct ctl_table_set *)) { } static inline void do_sysctl_args(void) { } static inline bool sysctl_is_alias(char *param) { return false; } #endif /* CONFIG_SYSCTL */ int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */
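/*
 * Illustrative sketch, not part of the original header: defining and
 * registering a minimal one-entry table with the interfaces declared
 * above. The names demo_value, demo_table, demo_header and
 * demo_sysctl_init() are hypothetical, as is placing the entry under
 * /proc/sys/demo; <linux/init.h> is assumed for __init.
 */
static int demo_value;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_value",	/* /proc/sys/demo/demo_value */
		.data		= &demo_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,		/* clamp writes to 0..100 */
		.extra2		= SYSCTL_ONE_HUNDRED,
	},
	{ }	/* an entry with a NULL procname terminates the table */
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
	/* register_sysctl() supplies ARRAY_SIZE(demo_table) for us. */
	demo_header = register_sysctl("demo", demo_table);
	if (!demo_header)
		return -ENOMEM;
	return 0;
}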
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_BL_H #define _LINUX_RCULIST_BL_H /* * RCU-protected bl list version. See include/linux/list_bl.h. */ #include <linux/list_bl.h> #include <linux/rcupdate.h> static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); rcu_assign_pointer(h->first, (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_bl_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry(). */ static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) { __hlist_bl_del(n); n->pprev = LIST_POISON2; } /** * hlist_bl_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_bl, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_bl_add_head_rcu() * or hlist_bl_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first; /* don't need hlist_bl_first_rcu because we're under lock */ first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; /* need _rcu because we can have concurrent lock free readers */ hlist_bl_set_first_rcu(h, n); } /** * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_bl_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_bl_node within the struct.
* */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = hlist_bl_first_rcu(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(pos->next)) #endif
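/*
 * Illustrative sketch, not part of the original header: a writer inserts
 * under the per-bucket bit lock (hlist_bl_lock() from list_bl.h) while
 * lock-free readers walk the chain under rcu_read_lock(). struct
 * demo_node, demo_insert() and demo_contains() are hypothetical.
 */
struct demo_node {
	struct hlist_bl_node link;
	int key;
};

static void demo_insert(struct hlist_bl_head *head, struct demo_node *n)
{
	hlist_bl_lock(head);	/* serializes against other list mutations */
	hlist_bl_add_head_rcu(&n->link, head);
	hlist_bl_unlock(head);
}

static bool demo_contains(struct hlist_bl_head *head, int key)
{
	struct hlist_bl_node *pos;
	struct demo_node *n;
	bool found = false;

	rcu_read_lock();	/* required around the _rcu traversal */
	hlist_bl_for_each_entry_rcu(n, pos, head, link) {
		if (n->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}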
// SPDX-License-Identifier: GPL-2.0 /* * inode.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org> * * debugfs is for people to use instead of /proc or /sys. * See ./Documentation/core-api/kernel-api.rst for more details. */ #define pr_fmt(fmt) "debugfs: " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/kobject.h> #include <linux/namei.h> #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/security.h> #include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ static int debugfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *ia) { int ret; if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret) return ret; } return simple_setattr(&nop_mnt_idmap, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_dir_inode_operations = { .lookup = simple_lookup, .setattr = debugfs_setattr, }; static const struct inode_operations debugfs_symlink_inode_operations = { .get_link = simple_get_link, .setattr = debugfs_setattr, }; static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } struct debugfs_mount_opts { kuid_t uid; kgid_t gid; umode_t mode; /* Opt_* bitfield.
*/ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, Opt_err }; static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, {Opt_err, NULL} }; struct debugfs_fs_info { struct debugfs_mount_opts mount_opts; }; static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) { substring_t args[MAX_OPT_ARGS]; int option; int token; kuid_t uid; kgid_t gid; char *p; opts->opts = 0; opts->mode = DEBUGFS_DEFAULT_MODE; while ((p = strsep(&data, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; uid = make_kuid(current_user_ns(), option); if (!uid_valid(uid)) return -EINVAL; opts->uid = uid; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) return -EINVAL; opts->gid = gid; break; case Opt_mode: if (match_octal(&args[0], &option)) return -EINVAL; opts->mode = option & S_IALLUGO; break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options */ } opts->opts |= BIT(token); } return 0; } static void _debugfs_apply_options(struct super_block *sb, bool remount) { struct debugfs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || opts->opts & BIT(Opt_mode)) { inode->i_mode &= ~S_IALLUGO; inode->i_mode |= opts->mode; } if (!remount || opts->opts & BIT(Opt_uid)) inode->i_uid = opts->uid; if (!remount || opts->opts & BIT(Opt_gid)) inode->i_gid = opts->gid; } static void debugfs_apply_options(struct super_block *sb) { _debugfs_apply_options(sb, false); } static void debugfs_apply_options_remount(struct super_block *sb) { _debugfs_apply_options(sb, true); } static int debugfs_remount(struct super_block *sb, int *flags, char *data) { int err; struct debugfs_fs_info *fsi = sb->s_fs_info; sync_filesystem(sb); err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; debugfs_apply_options_remount(sb); fail: return err; } static int debugfs_show_options(struct seq_file *m, struct dentry *root) { struct debugfs_fs_info *fsi = root->d_sb->s_fs_info; struct debugfs_mount_opts *opts = &fsi->mount_opts; if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid)); if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); if (opts->mode != DEBUGFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", opts->mode); return 0; } static void debugfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, .free_inode = debugfs_free_inode, }; static void debugfs_release_dentry(struct dentry *dentry) { struct debugfs_fsdata *fsd = dentry->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ if (fsd && fsd->real_fops) { WARN_ON(!list_empty(&fsd->cancellations)); mutex_destroy(&fsd->cancellations_mtx); } kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { struct debugfs_fsdata *fsd = path->dentry->d_fsdata; return 
fsd->automount(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { .d_delete = always_delete_dentry, .d_release = debugfs_release_dentry, .d_automount = debugfs_automount, }; static int debug_fill_super(struct super_block *sb, void *data, int silent) { static const struct tree_descr debug_files[] = {{""}}; struct debugfs_fs_info *fsi; int err; fsi = kzalloc(sizeof(struct debugfs_fs_info), GFP_KERNEL); sb->s_fs_info = fsi; if (!fsi) { err = -ENOMEM; goto fail; } err = debugfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; err = simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); if (err) goto fail; sb->s_op = &debugfs_super_operations; sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); return 0; fail: kfree(fsi); sb->s_fs_info = NULL; return err; } static struct dentry *debug_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); return mount_single(fs_type, flags, data, debug_fill_super); } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", .mount = debug_mount, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("debugfs"); /** * debugfs_lookup() - look up an existing debugfs file * @name: a pointer to a string containing the name of the file to look up. * @parent: a pointer to the parent dentry of the file. * * This function will return a pointer to a dentry if it succeeds. If the file * doesn't exist or an error occurs, %NULL will be returned. The returned * dentry must be passed to dput() when it is no longer needed. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) parent = debugfs_mount->mnt_root; dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); static struct dentry *start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); if (!debugfs_initialized()) return ERR_PTR(-ENOENT); pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) return parent; error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) { pr_err("Unable to pin filesystem for file '%s'\n", name); return ERR_PTR(error); } /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. 
*/ if (!parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", name, parent->d_name.name); else pr_err("File '%s' in directory '%s' already present!\n", name, parent->d_name.name); dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *proxy_fops, const struct file_operations *real_fops) { struct dentry *dentry; struct inode *inode; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = start_creating(name, parent); if (IS_ERR(dentry)) return dentry; if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); return ERR_PTR(-EPERM); } inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", name); return failed_creating(dentry); } inode->i_mode = mode; inode->i_private = data; inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } /** * debugfs_create_file - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. * * NOTE: it's expected that most callers should _ignore_ the errors returned * by this function. Other debugfs functions handle the fact that the "dentry" * passed to them could be an error and they don't crash in that case. * Drivers should generally work fine even if debugfs fails to init anyway. 
*/ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_full_proxy_file_operations : &debugfs_noop_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file); /** * debugfs_create_file_unsafe - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * debugfs_create_file_unsafe() is completely analogous to * debugfs_create_file(), the only difference being that the fops * handed it will not get protected against file removals by the * debugfs core. * * It is your responsibility to protect your struct file_operation * methods against file removals by means of debugfs_file_get() * and debugfs_file_put(). ->open() is still protected by * debugfs though. * * Any struct file_operations defined by means of * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and * thus, may be used here. */ struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { return __debugfs_create_file(name, mode, parent, data, fops ? &debugfs_open_proxy_file_operations : &debugfs_noop_file_operations, fops); } EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * @file_size: initial file size * * This is the basic "create a file" function for debugfs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the debugfs_create_dir() function is * recommended to be used instead.) */ void debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, loff_t file_size) { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); /** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. 
 *
 * This function will return a pointer to a dentry if it succeeds.  This
 * pointer must be passed to the debugfs_remove() function when the file is
 * to be removed (no automatic cleanup happens if your module is unloaded,
 * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
 * returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 *
 * NOTE: it's expected that most callers should _ignore_ the errors returned
 * by this function. Other debugfs functions handle the fact that the "dentry"
 * passed to them could be an error and they don't crash in that case.
 * Drivers should generally work fine even if debugfs fails to init anyway.
 */
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
	struct dentry *dentry = start_creating(name, parent);
	struct inode *inode;

	if (IS_ERR(dentry))
		return dentry;

	if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
		failed_creating(dentry);
		return ERR_PTR(-EPERM);
	}

	inode = debugfs_get_inode(dentry->d_sb);
	if (unlikely(!inode)) {
		pr_err("out of free dentries, can not create directory '%s'\n",
		       name);
		return failed_creating(dentry);
	}

	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
	inode->i_op = &debugfs_dir_inode_operations;
	inode->i_fop = &simple_dir_operations;

	/* directory inodes start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	d_instantiate(dentry, inode);
	inc_nlink(d_inode(dentry->d_parent));
	fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
	return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);

/**
 * debugfs_create_automount - create automount point in the debugfs filesystem
 * @name: a pointer to a string containing the name of the file to create.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this parameter is NULL, then the
 *          file will be created in the root of the debugfs filesystem.
 * @f: function to be called when pathname resolution steps on that one.
 * @data: opaque argument to pass to f().
 *
 * @f should return what ->d_automount() would.
 */
struct dentry *debugfs_create_automount(const char *name,
					struct dentry *parent,
					debugfs_automount_t f,
					void *data)
{
	struct dentry *dentry = start_creating(name, parent);
	struct debugfs_fsdata *fsd;
	struct inode *inode;

	if (IS_ERR(dentry))
		return dentry;

	fsd = kzalloc(sizeof(*fsd), GFP_KERNEL);
	if (!fsd) {
		failed_creating(dentry);
		return ERR_PTR(-ENOMEM);
	}

	fsd->automount = f;

	if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
		failed_creating(dentry);
		kfree(fsd);
		return ERR_PTR(-EPERM);
	}

	inode = debugfs_get_inode(dentry->d_sb);
	if (unlikely(!inode)) {
		pr_err("out of free dentries, can not create automount '%s'\n",
		       name);
		kfree(fsd);
		return failed_creating(dentry);
	}

	make_empty_dir_inode(inode);
	inode->i_flags |= S_AUTOMOUNT;
	inode->i_private = data;
	dentry->d_fsdata = fsd;
	/* directory inodes start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	d_instantiate(dentry, inode);
	inc_nlink(d_inode(dentry->d_parent));
	fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
	return end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);

/**
 * debugfs_create_symlink - create a symbolic link in the debugfs filesystem
 * @name: a pointer to a string containing the name of the symbolic link to
 *        create.
 * @parent: a pointer to the parent dentry for this symbolic link.  This
 *          should be a directory dentry if set.  If this parameter is NULL,
 *          then the symbolic link will be created in the root of the debugfs
 *          filesystem.
 * @target: a pointer to a string containing the path to the target of the
 *          symbolic link.
 *
 * This function creates a symbolic link with the given name in debugfs that
 * links to the given target path.
 *
 * This function will return a pointer to a dentry if it succeeds.  This
 * pointer must be passed to the debugfs_remove() function when the symbolic
 * link is to be removed (no automatic cleanup happens if your module is
 * unloaded, you are responsible here.)  If an error occurs, ERR_PTR(-ERROR)
 * will be returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
				      const char *target)
{
	struct dentry *dentry;
	struct inode *inode;
	char *link = kstrdup(target, GFP_KERNEL);

	if (!link)
		return ERR_PTR(-ENOMEM);

	dentry = start_creating(name, parent);
	if (IS_ERR(dentry)) {
		kfree(link);
		return dentry;
	}

	inode = debugfs_get_inode(dentry->d_sb);
	if (unlikely(!inode)) {
		pr_err("out of free dentries, can not create symlink '%s'\n",
		       name);
		kfree(link);
		return failed_creating(dentry);
	}
	inode->i_mode = S_IFLNK | S_IRWXUGO;
	inode->i_op = &debugfs_symlink_inode_operations;
	inode->i_link = link;
	d_instantiate(dentry, inode);
	return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);

static void __debugfs_file_removed(struct dentry *dentry)
{
	struct debugfs_fsdata *fsd;

	/*
	 * Paired with the closing smp_mb() implied by a successful
	 * cmpxchg() in debugfs_file_get(): either
	 * debugfs_file_get() must see a dead dentry or we must see a
	 * debugfs_fsdata instance at ->d_fsdata here (or both).
	 */
	smp_mb();
	fsd = READ_ONCE(dentry->d_fsdata);
	if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
		return;

	/* if we hit zero, just wait for all to finish */
	if (!refcount_dec_and_test(&fsd->active_users)) {
		wait_for_completion(&fsd->active_users_drained);
		return;
	}

	/* if we didn't hit zero, try to cancel any we can */
	while (refcount_read(&fsd->active_users)) {
		struct debugfs_cancellation *c;

		/*
		 * Lock the cancellations. Note that the cancellations
		 * structs are meant to be on the stack, so we need to
		 * ensure we either use them here or don't touch them,
		 * and debugfs_leave_cancellation() will wait for this
		 * to be finished processing before exiting one. It may
		 * of course win and remove the cancellation, but then
		 * chances are we never even got into this bit, we only
		 * do if the refcount isn't zero already.
		 */
		mutex_lock(&fsd->cancellations_mtx);
		while ((c = list_first_entry_or_null(&fsd->cancellations,
						     typeof(*c), list))) {
			list_del_init(&c->list);
			c->cancel(dentry, c->cancel_data);
		}
		mutex_unlock(&fsd->cancellations_mtx);

		wait_for_completion(&fsd->active_users_drained);
	}
}

static void remove_one(struct dentry *victim)
{
	if (d_is_reg(victim))
		__debugfs_file_removed(victim);
	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}

/**
 * debugfs_remove - recursively removes a directory
 * @dentry: a pointer to the dentry of the directory to be removed.  If this
 *          parameter is NULL or an error value, nothing will be done.
 *
 * This function recursively removes a directory tree in debugfs that
 * was previously created with a call to another debugfs function
 * (like debugfs_create_file() or variants thereof.)
 *
 * This function is required to be called in order for the file to be
 * removed, no automatic cleanup of files will happen when a module is
 * removed, you are responsible here.
 */
void debugfs_remove(struct dentry *dentry)
{
	if (IS_ERR_OR_NULL(dentry))
		return;

	simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
	simple_recursive_removal(dentry, remove_one);
	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
EXPORT_SYMBOL_GPL(debugfs_remove);

/**
 * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
 * @name: a pointer to a string containing the name of the item to look up.
 * @parent: a pointer to the parent dentry of the item.
 *
 * This is the equivalent of doing something like
 * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
 * handled for the directory being looked up.
 */
void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
{
	struct dentry *dentry;

	dentry = debugfs_lookup(name, parent);
	if (!dentry)
		return;

	debugfs_remove(dentry);
	dput(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);

/**
 * debugfs_rename - rename a file/directory in the debugfs filesystem
 * @old_dir: a pointer to the parent dentry for the renamed object. This
 *           should be a directory dentry.
 * @old_dentry: dentry of an object to be renamed.
 * @new_dir: a pointer to the parent dentry where the object should be
 *           moved. This should be a directory dentry.
 * @new_name: a pointer to a string containing the target name.
 *
 * This function renames a file/directory in debugfs.  The target must not
 * exist for rename to succeed.
 *
 * This function will return a pointer to old_dentry (which is updated to
 * reflect renaming) if it succeeds. If an error occurs, ERR_PTR(-ERROR)
 * will be returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
			      struct dentry *new_dir, const char *new_name)
{
	int error;
	struct dentry *dentry = NULL, *trap;
	struct name_snapshot old_name;

	if (IS_ERR(old_dir))
		return old_dir;
	if (IS_ERR(new_dir))
		return new_dir;
	if (IS_ERR_OR_NULL(old_dentry))
		return old_dentry;

	trap = lock_rename(new_dir, old_dir);
	/* Source or destination directories don't exist? */
	if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
		goto exit;
	/* Source does not exist, cyclic rename, or mountpoint? */
	if (d_really_is_negative(old_dentry) || old_dentry == trap ||
	    d_mountpoint(old_dentry))
		goto exit;
	dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
	/* Lookup failed, cyclic rename or target exists? */
	if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
		goto exit;

	take_dentry_name_snapshot(&old_name, old_dentry);

	error = simple_rename(&nop_mnt_idmap, d_inode(old_dir), old_dentry,
			      d_inode(new_dir), dentry, 0);
	if (error) {
		release_dentry_name_snapshot(&old_name);
		goto exit;
	}
	d_move(old_dentry, dentry);
	fsnotify_move(d_inode(old_dir), d_inode(new_dir), &old_name.name,
		      d_is_dir(old_dentry), NULL, old_dentry);
	release_dentry_name_snapshot(&old_name);
	unlock_rename(new_dir, old_dir);
	dput(dentry);
	return old_dentry;
exit:
	if (dentry && !IS_ERR(dentry))
		dput(dentry);
	unlock_rename(new_dir, old_dir);
	if (IS_ERR(dentry))
		return dentry;
	return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(debugfs_rename);

/**
 * debugfs_initialized - Tells whether debugfs has been registered
 */
bool debugfs_initialized(void)
{
	return debugfs_registered;
}
EXPORT_SYMBOL_GPL(debugfs_initialized);

static int __init debugfs_kernel(char *str)
{
	if (str) {
		if (!strcmp(str, "on"))
			debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT;
		else if (!strcmp(str, "no-mount"))
			debugfs_allow = DEBUGFS_ALLOW_API;
		else if (!strcmp(str, "off"))
			debugfs_allow = 0;
	}

	return 0;
}
early_param("debugfs", debugfs_kernel);

static int __init debugfs_init(void)
{
	int retval;

	if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT))
		return -EPERM;

	retval = sysfs_create_mount_point(kernel_kobj, "debug");
	if (retval)
		return retval;

	retval = register_filesystem(&debug_fs_type);
	if (retval)
		sysfs_remove_mount_point(kernel_kobj, "debug");
	else
		debugfs_registered = true;

	return retval;
}
core_initcall(debugfs_init);
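/*
 * Illustrative usage (not part of inode.c above): a minimal sketch of how a
 * module might consume the debugfs API documented in this file, pairing
 * DEFINE_DEBUGFS_ATTRIBUTE() with debugfs_create_file_unsafe() as described
 * in the debugfs_create_file_unsafe() kernel-doc. All identifiers prefixed
 * "example_" are hypothetical. Per the NOTE in the kernel-doc, errors from
 * the create helpers are deliberately not treated as fatal.
 */
#include <linux/debugfs.h>
#include <linux/module.h>

static struct dentry *example_dir;
static u32 example_value;

static int example_val_get(void *data, u64 *val)
{
	*val = *(u32 *)data;
	return 0;
}

static int example_val_set(void *data, u64 val)
{
	*(u32 *)data = val;
	return 0;
}
/* fops defined this way are already protected against file removal. */
DEFINE_DEBUGFS_ATTRIBUTE(example_val_fops, example_val_get, example_val_set,
			 "%llu\n");

static int __init example_init(void)
{
	/* /sys/kernel/debug/example/, then a read-write "value" file in it. */
	example_dir = debugfs_create_dir("example", NULL);
	debugfs_create_file_unsafe("value", 0600, example_dir, &example_value,
				   &example_val_fops);
	return 0;
}

static void __exit example_exit(void)
{
	/* No automatic cleanup: the whole tree must be removed explicitly. */
	debugfs_remove(example_dir);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");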
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
	const char *pivot;
	int result;

	while (num > 0) {
		pivot = base + (num >> 1) * size;
		result = cmp(key, pivot);

		if (result == 0)
			return (void *)pivot;

		if (result > 0) {
			base = pivot + size;
			num--;
		}
		num >>= 1;
	}

	return NULL;
}

extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */
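/*
 * Illustrative usage (not part of the header above): a small sketch of
 * calling bsearch() on a sorted array. The callback must match cmp_func_t
 * from <linux/types.h>, taking the search key first and the array element
 * second; the sorted_vals/cmp_int/contains names are hypothetical.
 */
#include <linux/bsearch.h>
#include <linux/kernel.h>

static const int sorted_vals[] = { 1, 3, 7, 9, 12, 42 };

static int cmp_int(const void *key, const void *elt)
{
	int k = *(const int *)key;
	int e = *(const int *)elt;

	if (k < e)
		return -1;
	return k > e ? 1 : 0;
}

static bool contains(int needle)
{
	/* bsearch() returns a pointer to the matching element, or NULL. */
	return bsearch(&needle, sorted_vals, ARRAY_SIZE(sorted_vals),
		       sizeof(sorted_vals[0]), cmp_int) != NULL;
}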
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/integrity.h>
#include <linux/ima.h>
#include <linux/evm.h>
#include <linux/fsnotify.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/msg.h>
#include <net/flow.h>

/* How many LSMs were built into the kernel? */
#define LSM_COUNT (__end_lsm_info - __start_lsm_info)

/*
 * How many LSMs are built into the kernel as determined at
 * build time. Used to determine fixed array sizes.
 * The capability module is accounted for by CONFIG_SECURITY.
 */
#define LSM_CONFIG_COUNT ( \
	(IS_ENABLED(CONFIG_SECURITY) ?
1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_SELINUX) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_SMACK) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_TOMOYO) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_APPARMOR) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_YAMA) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_LOADPIN) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_SAFESETID) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM) ? 1 : 0) + \ (IS_ENABLED(CONFIG_BPF_LSM) ? 1 : 0) + \ (IS_ENABLED(CONFIG_SECURITY_LANDLOCK) ? 1 : 0)) /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows * all security modules to use the same descriptions for auditing * purposes. */ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_EFI_TEST] = "/dev/efi_test access", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; struct security_hook_heads security_hook_heads __ro_after_init; static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; static struct lsm_blob_sizes blob_sizes __ro_after_init; /* Boot-time LSM user choice */ static __initdata const char *chosen_lsm_order; static __initdata const char *chosen_major_lsm; static __initconst const char *const builtin_lsm_order = CONFIG_LSM; /* Ordered list of LSMs to initialize. */ static __initdata struct lsm_info **ordered_lsms; static __initdata struct lsm_info *exclusive; static __initdata bool debug; #define init_debug(...) \ do { \ if (debug) \ pr_info(__VA_ARGS__); \ } while (0) static bool __init is_enabled(struct lsm_info *lsm) { if (!lsm->enabled) return false; return *lsm->enabled; } /* Mark an LSM's enabled flag. */ static int lsm_enabled_true __initdata = 1; static int lsm_enabled_false __initdata = 0; static void __init set_enabled(struct lsm_info *lsm, bool enabled) { /* * When an LSM hasn't configured an enable variable, we can use * a hard-coded location for storing the default enabled state. 
*/ if (!lsm->enabled) { if (enabled) lsm->enabled = &lsm_enabled_true; else lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_true) { if (!enabled) lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_false) { if (enabled) lsm->enabled = &lsm_enabled_true; } else { *lsm->enabled = enabled; } } /* Is an LSM already listed in the ordered LSMs list? */ static bool __init exists_ordered_lsm(struct lsm_info *lsm) { struct lsm_info **check; for (check = ordered_lsms; *check; check++) if (*check == lsm) return true; return false; } /* Append an LSM to the list of ordered LSMs to initialize. */ static int last_lsm __initdata; static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) { /* Ignore duplicate selections. */ if (exists_ordered_lsm(lsm)) return; if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from)) return; /* Enable this LSM, if it is not already set. */ if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; ordered_lsms[last_lsm++] = lsm; init_debug("%s ordered: %s (%s)\n", from, lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); } /* Is an LSM allowed to be initialized? */ static bool __init lsm_allowed(struct lsm_info *lsm) { /* Skip if the LSM is disabled. */ if (!is_enabled(lsm)) return false; /* Not allowed if another exclusive LSM already initialized. */ if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { init_debug("exclusive disabled: %s\n", lsm->name); return false; } return true; } static void __init lsm_set_blob_size(int *need, int *lbs) { int offset; if (*need <= 0) return; offset = ALIGN(*lbs, sizeof(void *)); *lbs = offset + *need; *need = offset; } static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) { if (!needed) return; lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); /* * The inode blob gets an rcu_head in addition to * what the modules might need. */ if (needed->lbs_inode && blob_sizes.lbs_inode == 0) blob_sizes.lbs_inode = sizeof(struct rcu_head); lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock); lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); } /* Prepare LSM for initialization. */ static void __init prepare_lsm(struct lsm_info *lsm) { int enabled = lsm_allowed(lsm); /* Record enablement (to handle any following exclusive LSMs). */ set_enabled(lsm, enabled); /* If enabled, do pre-initialization work. */ if (enabled) { if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { exclusive = lsm; init_debug("exclusive chosen: %s\n", lsm->name); } lsm_set_blob_sizes(lsm->blobs); } } /* Initialize a given LSM, if it is enabled. */ static void __init initialize_lsm(struct lsm_info *lsm) { if (is_enabled(lsm)) { int ret; init_debug("initializing %s\n", lsm->name); ret = lsm->init(); WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); } } /* * Current index to use while initializing the lsm id list. */ u32 lsm_active_cnt __ro_after_init; const struct lsm_id *lsm_idlist[LSM_CONFIG_COUNT]; /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { struct lsm_info *lsm; char *sep, *name, *next; /* LSM_ORDER_FIRST is always first. 
*/ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_FIRST) append_ordered_lsm(lsm, " first"); } /* Process "security=", if given. */ if (chosen_major_lsm) { struct lsm_info *major; /* * To match the original "security=" behavior, this * explicitly does NOT fallback to another Legacy Major * if the selected one was separately disabled: disable * all non-matching Legacy Major LSMs. */ for (major = __start_lsm_info; major < __end_lsm_info; major++) { if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && strcmp(major->name, chosen_major_lsm) != 0) { set_enabled(major, false); init_debug("security=%s disabled: %s (only one legacy major LSM)\n", chosen_major_lsm, major->name); } } } sep = kstrdup(order, GFP_KERNEL); next = sep; /* Walk the list, looking for matching LSMs. */ while ((name = strsep(&next, ",")) != NULL) { bool found = false; for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (strcmp(lsm->name, name) == 0) { if (lsm->order == LSM_ORDER_MUTABLE) append_ordered_lsm(lsm, origin); found = true; } } if (!found) init_debug("%s ignored: %s (not built into kernel)\n", origin, name); } /* Process "security=" again, this time appending the chosen major LSM if it * was not already picked up from the ordered list. */ if (chosen_major_lsm) { for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; if (strcmp(lsm->name, chosen_major_lsm) == 0) append_ordered_lsm(lsm, "security="); } } /* LSM_ORDER_LAST is always last. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_LAST) append_ordered_lsm(lsm, " last"); } /* Disable all LSMs not in the ordered list. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; set_enabled(lsm, false); init_debug("%s skipped: %s (not in requested order)\n", origin, lsm->name); } kfree(sep); } static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); static int lsm_append(const char *new, char **result); static void __init report_lsm_order(void) { struct lsm_info **lsm, *early; int first = 0; pr_info("initializing lsm="); /* Report each enabled LSM name, comma separated. */ for (early = __start_early_lsm_info; early < __end_early_lsm_info; early++) if (is_enabled(early)) pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); for (lsm = ordered_lsms; *lsm; lsm++) if (is_enabled(*lsm)) pr_cont("%s%s", first++ == 0 ?
"" : ",", (*lsm)->name); pr_cont("\n"); } static void __init ordered_lsm_init(void) { struct lsm_info **lsm; ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms), GFP_KERNEL); if (chosen_lsm_order) { if (chosen_major_lsm) { pr_warn("security=%s is ignored because it is superseded by lsm=%s\n", chosen_major_lsm, chosen_lsm_order); chosen_major_lsm = NULL; } ordered_lsm_parse(chosen_lsm_order, "cmdline"); } else ordered_lsm_parse(builtin_lsm_order, "builtin"); for (lsm = ordered_lsms; *lsm; lsm++) prepare_lsm(*lsm); report_lsm_order(); init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); init_debug("file blob size = %d\n", blob_sizes.lbs_file); init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); init_debug("task blob size = %d\n", blob_sizes.lbs_task); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); /* * Create any kmem_caches needed for blobs */ if (blob_sizes.lbs_file) lsm_file_cache = kmem_cache_create("lsm_file_cache", blob_sizes.lbs_file, 0, SLAB_PANIC, NULL); if (blob_sizes.lbs_inode) lsm_inode_cache = kmem_cache_create("lsm_inode_cache", blob_sizes.lbs_inode, 0, SLAB_PANIC, NULL); lsm_early_cred((struct cred *) current->cred); lsm_early_task(current); for (lsm = ordered_lsms; *lsm; lsm++) initialize_lsm(*lsm); kfree(ordered_lsms); } int __init early_security_init(void) { struct lsm_info *lsm; #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ INIT_HLIST_HEAD(&security_hook_heads.NAME); #include "linux/lsm_hook_defs.h" #undef LSM_HOOK for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; prepare_lsm(lsm); initialize_lsm(lsm); } return 0; } /** * security_init - initializes the security framework * * This should be called early in the kernel initialization sequence. */ int __init security_init(void) { struct lsm_info *lsm; init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*"); /* * Append the names of the early LSM modules now that kmalloc() is * available */ for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { init_debug(" early started: %s (%s)\n", lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); if (lsm->enabled) lsm_append(lsm->name, &lsm_names); } /* Load LSMs in specified order. */ ordered_lsm_init(); return 0; } /* Save user chosen LSM */ static int __init choose_major_lsm(char *str) { chosen_major_lsm = str; return 1; } __setup("security=", choose_major_lsm); /* Explicitly choose LSM initialization order. */ static int __init choose_lsm_order(char *str) { chosen_lsm_order = str; return 1; } __setup("lsm=", choose_lsm_order); /* Enable LSM order debugging. 
*/ static int __init enable_debug(char *str) { debug = true; return 1; } __setup("lsm.debug", enable_debug); static bool match_last_lsm(const char *list, const char *lsm) { const char *last; if (WARN_ON(!list || !lsm)) return false; last = strrchr(list, ','); if (last) /* Pass the comma, strcmp() will check for '\0' */ last++; else last = list; return !strcmp(last, lsm); } static int lsm_append(const char *new, char **result) { char *cp; if (*result == NULL) { *result = kstrdup(new, GFP_KERNEL); if (*result == NULL) return -ENOMEM; } else { /* Check if it is the last registered name */ if (match_last_lsm(*result, new)) return 0; cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); if (cp == NULL) return -ENOMEM; kfree(*result); *result = cp; } return 0; } /** * security_add_hooks - Add a module's hooks to the hook lists. * @hooks: the hooks to add * @count: the number of hooks to add * @lsmid: the identification information for the security module * * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, const struct lsm_id *lsmid) { int i; /* * A security module may call security_add_hooks() more * than once during initialization, and LSM initialization * is serialized. Landlock is one such case. * Look at the previous entry, if there is one, for duplication. */ if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { if (lsm_active_cnt >= LSM_CONFIG_COUNT) panic("%s Too many LSMs registered.\n", __func__); lsm_idlist[lsm_active_cnt++] = lsmid; } for (i = 0; i < count; i++) { hooks[i].lsmid = lsmid; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); } /* * Don't try to append during early_security_init(), we'll come back * and fix this up afterwards. */ if (slab_is_available()) { if (lsm_append(lsmid->name, &lsm_names) < 0) panic("%s - Cannot get early memory.\n", __func__); } } int call_blocking_lsm_notifier(enum lsm_event event, void *data) { return blocking_notifier_call_chain(&blocking_lsm_notifier_chain, event, data); } EXPORT_SYMBOL(call_blocking_lsm_notifier); int register_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(register_blocking_lsm_notifier); int unregister_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(unregister_blocking_lsm_notifier); /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob * @gfp: allocation type * * Allocate the cred blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { if (blob_sizes.lbs_cred == 0) { cred->security = NULL; return 0; } cred->security = kzalloc(blob_sizes.lbs_cred, gfp); if (cred->security == NULL) return -ENOMEM; return 0; } /** * lsm_early_cred - during initialization allocate a composite cred blob * @cred: the cred that needs a blob * * Allocate the cred blob for all the modules */ static void __init lsm_early_cred(struct cred *cred) { int rc = lsm_cred_alloc(cred, GFP_KERNEL); if (rc) panic("%s: Early cred alloc failed.\n", __func__); } /** * lsm_file_alloc - allocate a composite file blob * @file: the file that needs a blob * * Allocate the file blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated.
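 *
 * A sketch (hypothetical names, mirroring what in-tree modules do) of how an
 * individual LSM then reaches its own slice of the composite blob, using the
 * offset that lsm_set_blob_size() stored back into its struct lsm_blob_sizes
 * at init time:
 *
 *	static inline struct my_lsm_file *my_lsm_file(const struct file *file)
 *	{
 *		return file->f_security + my_lsm_blob_sizes.lbs_file;
 *	}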
*/ static int lsm_file_alloc(struct file *file) { if (!lsm_file_cache) { file->f_security = NULL; return 0; } file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); if (file->f_security == NULL) return -ENOMEM; return 0; } /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ int lsm_inode_alloc(struct inode *inode) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS); if (inode->i_security == NULL) return -ENOMEM; return 0; } /** * lsm_task_alloc - allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_task_alloc(struct task_struct *task) { if (blob_sizes.lbs_task == 0) { task->security = NULL; return 0; } task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL); if (task->security == NULL) return -ENOMEM; return 0; } /** * lsm_ipc_alloc - allocate a composite ipc blob * @kip: the ipc that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { if (blob_sizes.lbs_ipc == 0) { kip->security = NULL; return 0; } kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL); if (kip->security == NULL) return -ENOMEM; return 0; } /** * lsm_msg_msg_alloc - allocate a composite msg_msg blob * @mp: the msg_msg that needs a blob * * Allocate the msg_msg blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { if (blob_sizes.lbs_msg_msg == 0) { mp->security = NULL; return 0; } mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL); if (mp->security == NULL) return -ENOMEM; return 0; } /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules */ static void __init lsm_early_task(struct task_struct *task) { int rc = lsm_task_alloc(task); if (rc) panic("%s: Early task alloc failed.\n", __func__); } /** * lsm_superblock_alloc - allocate a composite superblock blob * @sb: the superblock that needs a blob * * Allocate the superblock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_superblock_alloc(struct super_block *sb) { if (blob_sizes.lbs_superblock == 0) { sb->s_security = NULL; return 0; } sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL); if (sb->s_security == NULL) return -ENOMEM; return 0; } /** * lsm_fill_user_ctx - Fill a user space lsm_ctx structure * @uctx: a userspace LSM context to be filled * @uctx_len: available uctx size (input), used uctx size (output) * @val: the new LSM context value * @val_len: the size of the new LSM context value * @id: LSM id * @flags: LSM defined flags * * Fill all of the fields in a userspace lsm_ctx structure. * * Returns 0 on success, -E2BIG if userspace buffer is not large enough, * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated.
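 *
 * Note that @uctx_len is written back even on failure, so a caller that got
 * -E2BIG can learn the required size and retry with a larger buffer, e.g.
 * (sketch, hypothetical caller):
 *
 *	size_t len = small_len;
 *	rc = lsm_fill_user_ctx(uctx, &len, val, val_len, id, 0);
 *	if (rc == -E2BIG)
 *		... reallocate a buffer of at least len bytes and call again ...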
*/ int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, void *val, size_t val_len, u64 id, u64 flags) { struct lsm_ctx *nctx = NULL; size_t nctx_len; int rc = 0; nctx_len = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *)); if (nctx_len > *uctx_len) { rc = -E2BIG; goto out; } nctx = kzalloc(nctx_len, GFP_KERNEL); if (nctx == NULL) { rc = -ENOMEM; goto out; } nctx->id = id; nctx->flags = flags; nctx->len = nctx_len; nctx->ctx_len = val_len; memcpy(nctx->ctx, val, val_len); if (copy_to_user(uctx, nctx, nctx_len)) rc = -EFAULT; out: kfree(nctx); *uctx_len = nctx_len; return rc; } /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: * * LSM_RET_DEFAULT(<hook_name>) * * The macros below define static constants for the default value of each * LSM hook. */ #define LSM_RET_DEFAULT(NAME) (NAME##_default) #define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) #define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK /* * Hook list operation macros. * * call_void_hook: * This is a hook that does not return a value. * * call_int_hook: * This is a hook that returns a value. */ #define call_void_hook(FUNC, ...) \ do { \ struct security_hook_list *P; \ \ hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \ P->hook.FUNC(__VA_ARGS__); \ } while (0) #define call_int_hook(FUNC, IRC, ...) ({ \ int RC = IRC; \ do { \ struct security_hook_list *P; \ \ hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \ RC = P->hook.FUNC(__VA_ARGS__); \ if (RC != 0) \ break; \ } \ } while (0); \ RC; \ }) /* Security operations */ /** * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok * @mgr: task credentials of current binder process * * Check whether @mgr is allowed to be the binder context manager. * * Return: Return 0 if permission is granted. */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } /** * security_binder_transaction() - Check if a binder transaction is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to invoke a binder transaction call to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } /** * security_binder_transfer_binder() - Check if a binder transfer is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to transfer a binder reference to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_binder(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } /** * security_binder_transfer_file() - Check if a binder file xfer is allowed * @from: sending process * @to: receiving process * @file: file being transferred * * Check whether @from is allowed to transfer @file to @to. * * Return: Returns 0 if permission is granted. 
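 *
 * As with the other binder hooks built on call_int_hook() above, the walk
 * over the registered modules stops at the first hook that returns a
 * non-zero value and that value is returned, so any single LSM can deny the
 * transfer.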
*/ int security_binder_transfer_file(const struct cred *from, const struct cred *to, const struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } /** * security_ptrace_access_check() - Check if tracing is allowed * @child: target process * @mode: PTRACE_MODE flags * * Check permission before allowing the current process to trace the @child * process. Security modules may also want to perform a process tracing check * during an execve, e.g. in the bprm_creds_for_exec hook, if the process is * being traced and its security attributes would be changed by the execve. * * Return: Returns 0 if permission is granted. */ int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { return call_int_hook(ptrace_access_check, 0, child, mode); } /** * security_ptrace_traceme() - Check if tracing is allowed * @parent: tracing process * * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself to the * @parent process for tracing. * * Return: Returns 0 if permission is granted. */ int security_ptrace_traceme(struct task_struct *parent) { return call_int_hook(ptrace_traceme, 0, parent); } /** * security_capget() - Get the capability sets for a process * @target: target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Get the @effective, @inheritable, and @permitted capability sets for the * @target process. The hook may also perform permission checking to determine * if the current process is allowed to see the capability sets of the @target * process. * * Return: Returns 0 if the capability sets were successfully obtained. */ int security_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return call_int_hook(capget, 0, target, effective, inheritable, permitted); } /** * security_capset() - Set the capability sets for a process * @new: new credentials for the target process * @old: current credentials of the target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Set the @effective, @inheritable, and @permitted capability sets for the * current process. * * Return: Returns 0 and updates @new if permission is granted. */ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return call_int_hook(capset, 0, new, old, effective, inheritable, permitted); } /** * security_capable() - Check if a process has the necessary capability * @cred: credentials to examine * @ns: user namespace * @cap: capability requested * @opts: capability check options * * Check whether the process represented by @cred has the @cap capability in * the indicated credentials. @cap contains the capability * <include/linux/capability.h>. @opts contains options for the capable check * <include/linux/security.h>. * * Return: Returns 0 if the capability is granted.
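 *
 * Illustrative use (sketch, not a caller in this file):
 *
 *	if (security_capable(cred, ns, CAP_SYS_ADMIN, CAP_OPT_NONE) != 0)
 *		return -EPERM;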
*/ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return call_int_hook(capable, 0, cred, ns, cap, opts); } /** * security_quotactl() - Check if a quotactl() syscall is allowed for this fs * @cmds: quotactl command * @type: quota type * @id: quota id * @sb: filesystem * * Check whether the quotactl syscall is allowed for this @sb. * * Return: Returns 0 if permission is granted. */ int security_quotactl(int cmds, int type, int id, const struct super_block *sb) { return call_int_hook(quotactl, 0, cmds, type, id, sb); } /** * security_quota_on() - Check if QUOTAON is allowed for a dentry * @dentry: dentry * * Check whether QUOTAON is allowed for @dentry. * * Return: Returns 0 if permission is granted. */ int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, 0, dentry); } /** * security_syslog() - Check if accessing the kernel message ring is allowed * @type: SYSLOG_ACTION_* type * * Check permission before accessing the kernel message ring or changing * logging to the console. See the syslog(2) manual page for an explanation of * the @type values. * * Return: Return 0 if permission is granted. */ int security_syslog(int type) { return call_int_hook(syslog, 0, type); } /** * security_settime64() - Check if changing the system time is allowed * @ts: new time * @tz: timezone * * Check permission to change the system time; struct timespec64 is defined in * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>. * * Return: Returns 0 if permission is granted. */ int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, 0, ts, tz); } /** * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed * @mm: mm struct * @pages: number of pages * * Check permissions for allocating a new virtual mapping. If all LSMs return * a positive value, __vm_enough_memory() will be called with cap_sys_admin * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be * called with cap_sys_admin cleared. * * Return: Returns 0 if permission is granted by the LSM infrastructure to the * caller. */ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct security_hook_list *hp; int cap_sys_admin = 1; int rc; /* * The module will respond with a positive value if * it thinks the __vm_enough_memory() call should be * made with the cap_sys_admin set. If all of the modules * agree that it should be set it will. If any module * thinks it should not be set it won't. */ hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) { rc = hp->hook.vm_enough_memory(mm, pages); if (rc <= 0) { cap_sys_admin = 0; break; } } return __vm_enough_memory(mm, pages, cap_sys_admin); } /** * security_bprm_creds_for_exec() - Prepare the credentials for exec() * @bprm: binary program information * * If the setup in prepare_exec_creds did not set up @bprm->cred->security * properly for executing @bprm->file, update the LSM's portion of * @bprm->cred->security to be what commit_creds needs to install for the new * program. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm * contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted.
*/ int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, 0, bprm); } /** * security_bprm_creds_from_file() - Update linux_binprm creds based on file * @bprm: binary program information * @file: associated file * * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon * exec, update @bprm->cred to reflect that change. This is called after * finding the binary that will be executed without an interpreter. This * ensures that the credentials will not be derived from a script that the * binary will need to reopen, which, when reopened, may end up being a * completely different file. This hook may also optionally check permissions * (e.g. for transitions between security domains). The hook must set * @bprm->secureexec to 1 if AT_SECURE should be set to request libc enable * secure mode. The hook must add to @bprm->per_clear any personality flags * that should be cleared from current->personality. @bprm contains the * linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { return call_int_hook(bprm_creds_from_file, 0, bprm, file); } /** * security_bprm_check() - Mediate binary handler search * @bprm: binary program information * * This hook mediates the point when a search for a binary handler will begin. * It allows a check against the @bprm->cred->security value which was set in * the preceding creds_for_exec call. The argv list and envp list are reliably * available in @bprm. This hook may be called multiple times during a single * execve. @bprm contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_check(struct linux_binprm *bprm) { int ret; ret = call_int_hook(bprm_check_security, 0, bprm); if (ret) return ret; return ima_bprm_check(bprm); } /** * security_bprm_committing_creds() - Install creds for a process during exec() * @bprm: binary program information * * Prepare to install the new security attributes of a process being * transformed by an execve operation, based on the old credentials pointed to * by @current->cred and the information set in @bprm->cred by the * bprm_creds_for_exec hook. @bprm points to the linux_binprm structure. This * hook is a good place to perform state changes on the process such as closing * open file descriptors to which access will no longer be granted when the * attributes are changed. This is called immediately before commit_creds(). */ void security_bprm_committing_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } /** * security_bprm_committed_creds() - Tidy up after cred install during exec() * @bprm: binary program information * * Tidy up after the installation of the new security attributes of a process * being transformed by an execve operation. The new credentials have, by this * point, been set to @current->cred. @bprm points to the linux_binprm * structure. This hook is a good place to perform state changes on the * process such as clearing out non-inheritable signal state. This is called * immediately after commit_creds().
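 *
 * Taken together, the bprm hooks above run in a fixed order on a successful
 * execve: bprm_creds_for_exec and bprm_creds_from_file prepare @bprm->cred,
 * bprm_check_security mediates the binary handler search,
 * bprm_committing_creds runs immediately before commit_creds(), and
 * bprm_committed_creds runs immediately after it.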
*/ void security_bprm_committed_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } /** * security_fs_context_submount() - Initialise fc->security * @fc: new filesystem context * @reference: dentry reference for submount/remount * * Fill out the ->security field for a new fs_context. * * Return: Returns 0 on success or negative error code on failure. */ int security_fs_context_submount(struct fs_context *fc, struct super_block *reference) { return call_int_hook(fs_context_submount, 0, fc, reference); } /** * security_fs_context_dup() - Duplicate a fs_context LSM blob * @fc: destination filesystem context * @src_fc: source filesystem context * * Allocate and attach a security structure to fc->security. This pointer is * initialised to NULL by the caller. @fc indicates the new filesystem context. * @src_fc indicates the original filesystem context. * * Return: Returns 0 on success or a negative error code on failure. */ int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, 0, fc, src_fc); } /** * security_fs_context_parse_param() - Configure a filesystem context * @fc: filesystem context * @param: filesystem parameter * * Userspace provided a parameter to configure a superblock. The LSM can * consume the parameter or return it to the caller for use elsewhere. * * Return: Returns 0 if the parameter was consumed by an LSM, -ENOPARAM if it * is being returned to the caller for use elsewhere, or another * negative error code on failure. */ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct security_hook_list *hp; int trc; int rc = -ENOPARAM; hlist_for_each_entry(hp, &security_hook_heads.fs_context_parse_param, list) { trc = hp->hook.fs_context_parse_param(fc, param); if (trc == 0) rc = 0; else if (trc != -ENOPARAM) return trc; } return rc; } /** * security_sb_alloc() - Allocate a super_block LSM blob * @sb: filesystem superblock * * Allocate and attach a security structure to the sb->s_security field. The * s_security field is initialized to NULL when the structure is allocated. * @sb contains the super_block structure to be modified. * * Return: Returns 0 if operation was successful. */ int security_sb_alloc(struct super_block *sb) { int rc = lsm_superblock_alloc(sb); if (unlikely(rc)) return rc; rc = call_int_hook(sb_alloc_security, 0, sb); if (unlikely(rc)) security_sb_free(sb); return rc; } /** * security_sb_delete() - Release super_block LSM associated objects * @sb: filesystem superblock * * Release objects tied to a superblock (e.g. inodes). @sb contains the * super_block structure being released. */ void security_sb_delete(struct super_block *sb) { call_void_hook(sb_delete, sb); } /** * security_sb_free() - Free a super_block LSM blob * @sb: filesystem superblock * * Deallocate and clear the sb->s_security field. @sb contains the super_block * structure to be modified. */ void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); kfree(sb->s_security); sb->s_security = NULL; } /** * security_free_mnt_opts() - Free memory associated with mount options * @mnt_opts: LSM processed mount options * * Free memory associated with @mnt_opts.
*/ void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) return; call_void_hook(sb_free_mnt_opts, *mnt_opts); *mnt_opts = NULL; } EXPORT_SYMBOL(security_free_mnt_opts); /** * security_sb_eat_lsm_opts() - Consume LSM mount options * @options: mount options * @mnt_opts: LSM processed mount options * * Scan @options for LSM-specific mount options, consume them, and save them * in @mnt_opts. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); /** * security_sb_mnt_opts_compat() - Check if new mount options are allowed * @sb: filesystem superblock * @mnt_opts: new mount options * * Determine if the new mount options in @mnt_opts are allowed given the * existing mounted filesystem at @sb, the superblock being compared. * * Return: Returns 0 if options are compatible. */ int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_mnt_opts_compat, 0, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_mnt_opts_compat); /** * security_sb_remount() - Verify no incompatible mount changes during remount * @sb: filesystem superblock * @mnt_opts: (re)mount options * * Extracts security system specific mount options and verifies no changes are * being made to those options. * * Return: Returns 0 if permission is granted. */ int security_sb_remount(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_remount, 0, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_remount); /** * security_sb_kern_mount() - Check if a kernel mount is allowed * @sb: filesystem superblock * * Mount this @sb if allowed by permissions. * * Return: Returns 0 if permission is granted. */ int security_sb_kern_mount(const struct super_block *sb) { return call_int_hook(sb_kern_mount, 0, sb); } /** * security_sb_show_options() - Output the mount options for a superblock * @m: output file * @sb: filesystem superblock * * Show (print on @m) mount options for this @sb. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, 0, m, sb); } /** * security_sb_statfs() - Check if accessing fs stats is allowed * @dentry: superblock handle * * Check permission before obtaining filesystem statistics for the * mountpoint. @dentry is a handle on the superblock for the filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, 0, dentry); } /** * security_sb_mount() - Check permission for mounting a filesystem * @dev_name: filesystem backing device * @path: mount point * @type: filesystem type * @flags: mount flags * @data: filesystem specific data * * Check permission before an object specified by @dev_name is mounted on the * mount point named by @path. For an ordinary mount, @dev_name identifies a * device if the file system type requires a device. For a remount * (@flags & MS_REMOUNT), @dev_name is irrelevant. For a loopback/bind mount * (@flags & MS_BIND), @dev_name identifies the pathname of the object being * mounted. * * Return: Returns 0 if permission is granted.
*/ int security_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data); } /** * security_sb_umount() - Check permission for unmounting a filesystem * @mnt: mounted filesystem * @flags: unmount flags * * Check permission before the @mnt file system is unmounted. * * Return: Returns 0 if permission is granted. */ int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, 0, mnt, flags); } /** * security_sb_pivotroot() - Check permissions for pivoting the rootfs * @old_path: new location for current rootfs * @new_path: location of the new rootfs * * Check permission before pivoting the root filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return call_int_hook(sb_pivotroot, 0, old_path, new_path); } /** * security_sb_set_mnt_opts() - Set the mount options for a filesystem * @sb: filesystem superblock * @mnt_opts: binary mount options * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Set the security relevant mount options used for a superblock. * * Return: Returns 0 on success, error on failure. */ int security_sb_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_set_mnt_opts, mnt_opts ? -EOPNOTSUPP : 0, sb, mnt_opts, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_set_mnt_opts); /** * security_sb_clone_mnt_opts() - Duplicate superblock mount options * @oldsb: source superblock * @newsb: destination superblock * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Copy all security options from a given superblock to another. * * Return: Returns 0 on success, error on failure. */ int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); /** * security_move_mount() - Check permissions for moving a mount * @from_path: source mount point * @to_path: destination mount point * * Check permission before a mount is moved. * * Return: Returns 0 if permission is granted. */ int security_move_mount(const struct path *from_path, const struct path *to_path) { return call_int_hook(move_mount, 0, from_path, to_path); } /** * security_path_notify() - Check if setting a watch is allowed * @path: file path * @mask: event mask * @obj_type: file path type * * Check permissions before setting a watch on events as defined by @mask, on * an object at @path, whose type is defined by @obj_type. * * Return: Returns 0 if permission is granted. */ int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { return call_int_hook(path_notify, 0, path, mask, obj_type); } /** * security_inode_alloc() - Allocate an inode LSM blob * @inode: the inode * * Allocate and attach a security structure to @inode->i_security. The * i_security field is initialized to NULL when the inode structure is * allocated. * * Return: Return 0 if operation was successful. 
*/ int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); if (unlikely(rc)) return rc; rc = call_int_hook(inode_alloc_security, 0, inode); if (unlikely(rc)) security_inode_free(inode); return rc; } static void inode_free_by_rcu(struct rcu_head *head) { /* * The rcu head is at the start of the inode blob */ kmem_cache_free(lsm_inode_cache, head); } /** * security_inode_free() - Free an inode's LSM blob * @inode: the inode * * Deallocate the inode security structure and set @inode->i_security to NULL. */ void security_inode_free(struct inode *inode) { integrity_inode_free(inode); call_void_hook(inode_free_security, inode); /* * The inode may still be referenced in a path walk and * a call to security_inode_permission() can be made * after inode_free_security() is called. Ideally, the VFS * wouldn't do this, but fixing that is a much harder * job. For now, simply free the i_security via RCU, and * leave the current inode->i_security pointer intact. * The inode will be freed after the RCU grace period too. */ if (inode->i_security) call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu); } /** * security_dentry_init_security() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @xattr_name: name of the security/LSM xattr * @ctx: pointer to the resulting LSM context * @ctxlen: length of @ctx * * Compute a context for a dentry as the inode is not yet available since NFSv4 * has no label backed by an EA anyway. It is important to note that * @xattr_name does not need to be free'd by the caller, it is a static string. * * Return: Returns 0 on success, negative values on failure. */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) { struct security_hook_list *hp; int rc; /* * Only one module will provide a security context. */ hlist_for_each_entry(hp, &security_hook_heads.dentry_init_security, list) { rc = hp->hook.dentry_init_security(dentry, mode, name, xattr_name, ctx, ctxlen); if (rc != LSM_RET_DEFAULT(dentry_init_security)) return rc; } return LSM_RET_DEFAULT(dentry_init_security); } EXPORT_SYMBOL(security_dentry_init_security); /** * security_dentry_create_files_as() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @old: creds to use for LSM context calculations * @new: creds to modify * * Compute a context for a dentry as the inode is not yet available and set * that context in passed in creds so that new files are created using that * context. Context is calculated using the passed in creds and not the creds * of the caller. * * Return: Returns 0 on success, error on failure. */ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, 0, dentry, mode, name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); /** * security_inode_init_security() - Initialize an inode's LSM context * @inode: the inode * @dir: parent directory * @qstr: last component of the pathname * @initxattrs: callback function to write xattrs * @fs_data: filesystem specific data * * Obtain the security attribute name suffix and value to set on a newly * created inode and set up the incore security field for the new inode. 
This * hook is called by the fs code as part of the inode creation transaction and * provides for atomic labeling of the inode, unlike the post_create/mkdir/... * hooks called by the VFS. * * The hook function is expected to populate the xattrs array by calling * lsm_get_xattr_slot() to retrieve the slots reserved by the security module * with the lbs_xattr_count field of the lsm_blob_sizes structure. For each * slot, the hook function should set ->name to the attribute name suffix * (e.g. selinux), allocate ->value (it will be freed by the caller) and fill * it with the attribute value, and set ->value_len to the length of the value. * If the security module does not use security attributes or does not wish to * put a security attribute on this particular inode, then it should return * -EOPNOTSUPP to skip this processing. * * Return: Returns 0 if the LSM successfully initialized all of the inode * security attributes that are required, negative values otherwise. */ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) { struct security_hook_list *hp; struct xattr *new_xattrs = NULL; int ret = -EOPNOTSUPP, xattr_count = 0; if (unlikely(IS_PRIVATE(inode))) return 0; if (!blob_sizes.lbs_xattr_count) return 0; if (initxattrs) { /* Allocate +1 for EVM and +1 as terminator. */ new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 2, sizeof(*new_xattrs), GFP_NOFS); if (!new_xattrs) return -ENOMEM; } hlist_for_each_entry(hp, &security_hook_heads.inode_init_security, list) { ret = hp->hook.inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret && ret != -EOPNOTSUPP) goto out; /* * As documented in lsm_hooks.h, -EOPNOTSUPP in this context * means that the LSM is not willing to provide an xattr, not * that it wants to signal an error. Thus, continue to invoke * the remaining LSMs. */ } /* If initxattrs() is NULL, xattr_count is zero, skip the call. */ if (!xattr_count) goto out; ret = evm_inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret) goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: for (; xattr_count > 0; xattr_count--) kfree(new_xattrs[xattr_count - 1].value); kfree(new_xattrs); return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); /** * security_inode_init_security_anon() - Initialize an anonymous inode * @inode: the inode * @name: the anonymous inode class * @context_inode: an optional related inode * * Set up the incore security field for the new anonymous inode and return * whether the inode creation is permitted by the security module or not. * * Return: Returns 0 on success, -EACCES if the security module denies the * creation of this inode, or another -errno upon other errors. */ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { return call_int_hook(inode_init_security_anon, 0, inode, name, context_inode); } #ifdef CONFIG_SECURITY_PATH /** * security_path_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a file. Note that this hook is called even * if mknod operation is being done for a regular file. * * Return: Returns 0 if permission is granted.
*/ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mknod, 0, dir, dentry, mode, dev); } EXPORT_SYMBOL(security_path_mknod); /** * security_path_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory. * * Return: Returns 0 if permission is granted. */ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mkdir, 0, dir, dentry, mode); } EXPORT_SYMBOL(security_path_mkdir); /** * security_path_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to remove * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_rmdir, 0, dir, dentry); } /** * security_path_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_unlink, 0, dir, dentry); } EXPORT_SYMBOL(security_path_unlink); /** * security_path_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: file pathname * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_symlink, 0, dir, dentry, old_name); } /** * security_path_link - Check if creating a hard link is allowed * @old_dentry: existing file * @new_dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } /** * security_path_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. 
*/ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; return call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir, new_dentry, flags); } EXPORT_SYMBOL(security_path_rename); /** * security_path_truncate() - Check if truncating a file is allowed * @path: file * * Check permission before truncating the file indicated by path. Note that * truncation permissions may also be checked based on already opened files, * using the security_file_truncate() hook. * * Return: Returns 0 if permission is granted. */ int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_truncate, 0, path); } /** * security_path_chmod() - Check if changing the file's mode is allowed * @path: file * @mode: new mode * * Check for permission to change a mode of the file @path. The new mode is * specified in @mode which is a bitmask of constants from * <include/uapi/linux/stat.h>. * * Return: Returns 0 if permission is granted. */ int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chmod, 0, path, mode); } /** * security_path_chown() - Check if changing the file's owner/group is allowed * @path: file * @uid: file owner * @gid: file group * * Check for permission to change owner/group of a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chown, 0, path, uid, gid); } /** * security_path_chroot() - Check if changing the root directory is allowed * @path: directory * * Check for permission to change root directory. * * Return: Returns 0 if permission is granted. */ int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, 0, path); } #endif /* CONFIG_SECURITY_PATH */ /** * security_inode_create() - Check if creating a file is allowed * @dir: the parent directory * @dentry: the file being created * @mode: requested file mode * * Check permission to create a regular file. * * Return: Returns 0 if permission is granted. */ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_create, 0, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_create); /** * security_inode_link() - Check if creating a hard link is allowed * @old_dentry: existing file * @dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, 0, old_dentry, dir, new_dentry); } /** * security_inode_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. 
*/ int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_unlink, 0, dir, dentry); } /** * security_inode_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: existing filename * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_symlink(struct inode *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, 0, dir, dentry, old_name); } /** * security_inode_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory * associated with inode structure @dir. * * Return: Returns 0 if permission is granted. */ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mkdir, 0, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_mkdir); /** * security_inode_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to be removed * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_rmdir, 0, dir, dentry); } /** * security_inode_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a special file (or a socket or a fifo file * created via the mknod system call). Note that if mknod operation is being * done for a regular file, then the create hook will be called and not this * hook. * * Return: Returns 0 if permission is granted. */ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, 0, dir, dentry, mode, dev); } /** * security_inode_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, 0, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, 0, old_dir, old_dentry, new_dir, new_dentry); } /** * security_inode_readlink() - Check if reading a symbolic link is allowed * @dentry: link * * Check the permission to read the symbolic link. * * Return: Returns 0 if permission is granted.
*/ int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_readlink, 0, dentry); } /** * security_inode_follow_link() - Check if following a symbolic link is allowed * @dentry: link dentry * @inode: link inode * @rcu: true if in RCU-walk mode * * Check permission to follow a symbolic link when looking up a pathname. If * @rcu is true, @inode is not stable. * * Return: Returns 0 if permission is granted. */ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_follow_link, 0, dentry, inode, rcu); } /** * security_inode_permission() - Check if accessing an inode is allowed * @inode: inode * @mask: access mask * * Check permission before accessing an inode. This hook is called by the * existing Linux permission function, so a security module can use it to * provide additional checking for existing Linux permission checks. Notice * that this hook is called when a file is opened (as well as many other * operations), whereas the file_security_ops permission hook is called when * the actual read/write operations are performed. * * Return: Returns 0 if permission is granted. */ int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_permission, 0, inode, mask); } /** * security_inode_setattr() - Check if setting file attributes is allowed * @idmap: idmap of the mount * @dentry: file * @attr: new attributes * * Check permission before setting file attributes. Note that the kernel call * to notify_change is performed from several locations, whenever file * attributes change (such as when a file is truncated, chown/chmod operations, * transferring disk quotas, etc). * * Return: Returns 0 if permission is granted. */ int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_setattr, 0, dentry, attr); if (ret) return ret; return evm_inode_setattr(idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); /** * security_inode_getattr() - Check if getting file attributes is allowed * @path: file * * Check permission before obtaining file attributes. * * Return: Returns 0 if permission is granted. */ int security_inode_getattr(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(inode_getattr, 0, path); } /** * security_inode_setxattr() - Check if setting file xattrs is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * @value: xattr value * @size: size of xattr value * @flags: flags * * Check permission before setting the extended attributes. * * Return: Returns 0 if permission is granted. */ int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. 
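 *
 * Concretely: the default of 1 passed to call_int_hook() below is not a
 * valid hook return value, so it survives only when no module supplied an
 * inode_setxattr hook; in that case the generic cap_inode_setxattr() check
 * is applied instead.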
*/ ret = call_int_hook(inode_setxattr, 1, idmap, dentry, name, value, size, flags); if (ret == 1) ret = cap_inode_setxattr(dentry, name, value, size, flags); if (ret) return ret; ret = ima_inode_setxattr(dentry, name, value, size); if (ret) return ret; return evm_inode_setxattr(idmap, dentry, name, value, size); } /** * security_inode_set_acl() - Check if setting posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * @kacl: acl struct * * Check permission before setting posix acls; the posix acls in @kacl are * identified by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_set_acl, 0, idmap, dentry, acl_name, kacl); if (ret) return ret; ret = ima_inode_set_acl(idmap, dentry, acl_name, kacl); if (ret) return ret; return evm_inode_set_acl(idmap, dentry, acl_name, kacl); } /** * security_inode_get_acl() - Check if reading posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before getting posix acls; the posix acls are identified * by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_get_acl, 0, idmap, dentry, acl_name); } /** * security_inode_remove_acl() - Check if removing a posix acl is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before removing posix acls; the posix acls are identified * by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_remove_acl, 0, idmap, dentry, acl_name); if (ret) return ret; ret = ima_inode_remove_acl(idmap, dentry, acl_name); if (ret) return ret; return evm_inode_remove_acl(idmap, dentry, acl_name); } /** * security_inode_post_setxattr() - Update the inode after a setxattr operation * @dentry: file * @name: xattr name * @value: xattr value * @size: xattr value size * @flags: flags * * Update inode security field after successful setxattr operation. */ void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setxattr, dentry, name, value, size, flags); evm_inode_post_setxattr(dentry, name, value, size); } /** * security_inode_getxattr() - Check if xattr access is allowed * @dentry: file * @name: xattr name * * Check permission before obtaining the extended attributes identified by * @name for @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_getxattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_getxattr, 0, dentry, name); } /** * security_inode_listxattr() - Check if listing xattrs is allowed * @dentry: file * * Check permission before obtaining the list of extended attribute names for * @dentry. * * Return: Returns 0 if permission is granted.
 */ int security_inode_listxattr(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_listxattr, 0, dentry); } /** * security_inode_removexattr() - Check if removing an xattr is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * * Check permission before removing the extended attribute identified by @name * for @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ ret = call_int_hook(inode_removexattr, 1, idmap, dentry, name); if (ret == 1) ret = cap_inode_removexattr(idmap, dentry, name); if (ret) return ret; ret = ima_inode_removexattr(dentry, name); if (ret) return ret; return evm_inode_removexattr(idmap, dentry, name); } /** * security_inode_need_killpriv() - Check if security_inode_killpriv() required * @dentry: associated dentry * * Called when an inode has been changed to determine if * security_inode_killpriv() should be called. * * Return: Return <0 on error to abort the inode change operation, return 0 if * security_inode_killpriv() does not need to be called, return >0 if * security_inode_killpriv() does need to be called. */ int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, 0, dentry); } /** * security_inode_killpriv() - The setuid bit is removed, update LSM state * @idmap: idmap of the mount * @dentry: associated dentry * * The @dentry's setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. * * Return: Return 0 on success. If an error is returned, the operation * causing the setuid bit removal fails. */ int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { return call_int_hook(inode_killpriv, 0, idmap, dentry); } /** * security_inode_getsecurity() - Get the xattr security label of an inode * @idmap: idmap of the mount * @inode: inode * @name: xattr name * @buffer: security label buffer * @alloc: allocation flag * * Retrieve a copy of the extended attribute representation of the security * label associated with @name for @inode via @buffer. Note that @name is the * remainder of the attribute name after the security prefix has been removed. * @alloc is used to specify if the call should return a value via the buffer * or just the value length. * * Return: Returns size of buffer on success. */ int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { struct security_hook_list *hp; int rc; if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_getsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { rc = hp->hook.inode_getsecurity(idmap, inode, name, buffer, alloc); if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } return LSM_RET_DEFAULT(inode_getsecurity); } /** * security_inode_setsecurity() - Set the xattr security label of an inode * @inode: inode * @name: xattr name * @value: security label * @size: length of security label * @flags: flags * * Set the security label associated with @name for @inode from the extended * attribute value @value. @size indicates the size of the @value in bytes.
 * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the * remainder of the attribute name after the security. prefix has been removed. * * Return: Returns 0 on success. */ int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct security_hook_list *hp; int rc; if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_setsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_setsecurity, list) { rc = hp->hook.inode_setsecurity(inode, name, value, size, flags); if (rc != LSM_RET_DEFAULT(inode_setsecurity)) return rc; } return LSM_RET_DEFAULT(inode_setsecurity); } /** * security_inode_listsecurity() - List the xattr security label names * @inode: inode * @buffer: buffer * @buffer_size: size of buffer * * Copy the extended attribute names for the security labels associated with * @inode into @buffer. The maximum size of @buffer is specified by * @buffer_size. @buffer may be NULL to request the size of the buffer * required. * * Return: Returns number of bytes used/required on success. */ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_listsecurity, 0, inode, buffer, buffer_size); } EXPORT_SYMBOL(security_inode_listsecurity); /** * security_inode_getsecid() - Get an inode's secid * @inode: inode * @secid: secid to return * * Get the secid associated with the inode. In case of failure, @secid will be * set to zero. */ void security_inode_getsecid(struct inode *inode, u32 *secid) { call_void_hook(inode_getsecid, inode, secid); } /** * security_inode_copy_up() - Create new creds for an overlayfs copy-up op * @src: union dentry of copy-up file * @new: newly created creds * * A file is about to be copied up from lower layer to upper layer of overlay * filesystem. The security module can prepare a set of new creds, modify them * as needed, and return the new creds. The caller will switch to the new creds * temporarily to create the new file and then release the newly allocated * creds. * * Return: Returns 0 on success or a negative error code on error. */ int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, 0, src, new); } EXPORT_SYMBOL(security_inode_copy_up); /** * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op * @name: xattr name * * Filter the xattrs being copied up when a unioned file is copied up from a * lower layer to the union/overlay layer. The caller is responsible for * reading and writing the xattrs; this hook is merely a filter. * * Return: Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP * if the security module does not know about the attribute, or a negative * error code to abort the copy up. */ int security_inode_copy_up_xattr(const char *name) { struct security_hook_list *hp; int rc; /* * The implementation can return 0 (accept the xattr), 1 (discard the * xattr), -EOPNOTSUPP if it does not know anything about the xattr or * any other error code in case of an error.
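	 *
	 * For instance, a hypothetical hook that only owns the (illustrative,
	 * not real) "security.example" xattr might look like:
	 *
	 *	static int example_inode_copy_up_xattr(const char *name)
	 *	{
	 *		if (!strcmp(name, "security.example"))
	 *			return 0;	// keep our own label on the copy
	 *		return -EOPNOTSUPP;	// not ours, let other LSMs decide
	 *	}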
*/ hlist_for_each_entry(hp, &security_hook_heads.inode_copy_up_xattr, list) { rc = hp->hook.inode_copy_up_xattr(name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; } return evm_inode_copy_up_xattr(name); } EXPORT_SYMBOL(security_inode_copy_up_xattr); /** * security_kernfs_init_security() - Init LSM context for a kernfs node * @kn_dir: parent kernfs node * @kn: the kernfs node to initialize * * Initialize the security context of a newly created kernfs node based on its * own and its parent's attributes. * * Return: Returns 0 if permission is granted. */ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, 0, kn_dir, kn); } /** * security_file_permission() - Check file permissions * @file: file * @mask: requested permissions * * Check file permissions before accessing an open file. This hook is called * by various operations that read or write files. A security module can use * this hook to perform additional checking on these operations, e.g. to * revalidate permissions on use to support privilege bracketing or policy * changes. Notice that this hook is used when the actual read/write * operations are performed, whereas the inode_security_ops hook is called when * a file is opened (as well as many other operations). Although this hook can * be used to revalidate permissions for various system call operations that * read or write files, it does not address the revalidation of permissions for * memory-mapped files. Security modules must handle this separately if they * need such revalidation. * * Return: Returns 0 if permission is granted. */ int security_file_permission(struct file *file, int mask) { return call_int_hook(file_permission, 0, file, mask); } /** * security_file_alloc() - Allocate and init a file's LSM blob * @file: the file * * Allocate and attach a security structure to the file->f_security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if the hook is successful and permission is granted. */ int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); if (rc) return rc; rc = call_int_hook(file_alloc_security, 0, file); if (unlikely(rc)) security_file_free(file); return rc; } /** * security_file_free() - Free a file's LSM blob * @file: the file * * Deallocate and free any security structures stored in file->f_security. */ void security_file_free(struct file *file) { void *blob; call_void_hook(file_free_security, file); blob = file->f_security; if (blob) { file->f_security = NULL; kmem_cache_free(lsm_file_cache, blob); } } /** * security_file_ioctl() - Check if an ioctl is allowed * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Check permission for an ioctl operation on @file. Note that @arg sometimes * represents a user space pointer; in other cases, it may be a simple integer * value. When @arg represents a user space pointer, it should never be used * by the security module. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, 0, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl); /** * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Compat version of security_file_ioctl() that correctly handles 32-bit * processes running on 64-bit kernels. 
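 * LSMs typically route both this hook and file_ioctl to one shared helper
 * so that 32-bit and 64-bit callers are policed identically; a
 * hypothetical sketch (the example_* names are illustrative, not part of
 * this file):
 *
 *	static int example_file_ioctl_compat(struct file *file,
 *					     unsigned int cmd,
 *					     unsigned long arg)
 *	{
 *		switch (cmd) {
 *		case FS_IOC32_GETFLAGS:		// 32-bit encoding of
 *			cmd = FS_IOC_GETFLAGS;	// FS_IOC_GETFLAGS
 *			break;
 *		}
 *		return example_check_ioctl(file, cmd);
 *	}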
 * * Return: Returns 0 if permission is granted. */ int security_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl_compat, 0, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl_compat); static inline unsigned long mmap_prot(struct file *file, unsigned long prot) { /* * Do we have PROT_READ and does the application expect * it to imply PROT_EXEC? If not, nothing to talk about... */ if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ) return prot; if (!(current->personality & READ_IMPLIES_EXEC)) return prot; /* * if that's an anonymous mapping, let it. */ if (!file) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file); if (!(caps & NOMMU_MAP_EXEC)) return prot; } #endif return prot | PROT_EXEC; } /* anything on noexec mount won't get PROT_EXEC */ return prot; } /** * security_mmap_file() - Check if mmap'ing a file is allowed * @file: file * @prot: protection applied by the kernel * @flags: flags * * Check permissions for a mmap operation. The @file may be NULL, e.g. if * mapping anonymous memory. * * Return: Returns 0 if permission is granted. */ int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { unsigned long prot_adj = mmap_prot(file, prot); int ret; ret = call_int_hook(mmap_file, 0, file, prot, prot_adj, flags); if (ret) return ret; return ima_file_mmap(file, prot, prot_adj, flags); } /** * security_mmap_addr() - Check if mmap'ing an address is allowed * @addr: address * * Check permissions for a mmap operation at @addr. * * Return: Returns 0 if permission is granted. */ int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, 0, addr); } /** * security_file_mprotect() - Check if changing memory protections is allowed * @vma: memory region * @reqprot: application requested protection * @prot: protection applied by the kernel * * Check permissions before changing memory access permissions. * * Return: Returns 0 if permission is granted. */ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { int ret; ret = call_int_hook(file_mprotect, 0, vma, reqprot, prot); if (ret) return ret; return ima_file_mprotect(vma, prot); } /** * security_file_lock() - Check if a file lock is allowed * @file: file * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK) * * Check permission before performing file locking operations. Note the hook * mediates both flock and fcntl style locks. * * Return: Returns 0 if permission is granted. */ int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, 0, file, cmd); } /** * security_file_fcntl() - Check if fcntl() op is allowed * @file: file * @cmd: fcntl command * @arg: command argument * * Check permission before allowing the file operation specified by @cmd to be * performed on the file @file. Note that @arg sometimes represents a * user space pointer; in other cases, it may be a simple integer value. When * @arg represents a user space pointer, it should never be used by the * security module. * * Return: Returns 0 if permission is granted.
 */ int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_fcntl, 0, file, cmd, arg); } /** * security_file_set_fowner() - Set the file owner info in the LSM blob * @file: the file * * Save owner security information (typically from current->security) in * file->f_security for later use by the send_sigiotask hook. */ void security_file_set_fowner(struct file *file) { call_void_hook(file_set_fowner, file); } /** * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed * @tsk: target task * @fown: signal sender * @sig: signal to be sent, SIGIO is sent if 0 * * Check permission for the file owner @fown to send SIGIO or SIGURG to the * process @tsk. Note that this hook is sometimes called from interrupt. Note * that the fown_struct, @fown, is never outside the context of a struct file, * so the file structure (and associated security information) can always be * obtained: container_of(fown, struct file, f_owner). * * Return: Returns 0 if permission is granted. */ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig) { return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig); } /** * security_file_receive() - Check if receiving a file via IPC is allowed * @file: file being received * * This hook allows security modules to control the ability of a process to * receive an open file descriptor via socket IPC. * * Return: Returns 0 if permission is granted. */ int security_file_receive(struct file *file) { return call_int_hook(file_receive, 0, file); } /** * security_file_open() - Save open() time state for later use by the LSM * @file: the file * * Save open-time permission checking state for later use upon file_permission, * and recheck access if anything has changed since inode_permission. * * Return: Returns 0 if permission is granted. */ int security_file_open(struct file *file) { int ret; ret = call_int_hook(file_open, 0, file); if (ret) return ret; return fsnotify_open_perm(file); } /** * security_file_truncate() - Check if truncating a file is allowed * @file: file * * Check permission before truncating a file, i.e. using ftruncate. Note that * truncation permission may also be checked based on the path, using the * @path_truncate hook. * * Return: Returns 0 if permission is granted. */ int security_file_truncate(struct file *file) { return call_int_hook(file_truncate, 0, file); } /** * security_task_alloc() - Allocate a task's LSM blob * @task: the task * @clone_flags: flags indicating what is being shared * * Handle allocation of task-related resources. * * Return: Returns a zero on success, negative values on failure. */ int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { int rc = lsm_task_alloc(task); if (rc) return rc; rc = call_int_hook(task_alloc, 0, task, clone_flags); if (unlikely(rc)) security_task_free(task); return rc; } /** * security_task_free() - Free a task's LSM blob and related resources * @task: task * * Handle release of task-related resources. Note that this can be called from * interrupt context. */ void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); kfree(task->security); task->security = NULL; } /** * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer * @cred: credentials * @gfp: gfp flags * * Only allocate sufficient memory and attach to @cred such that * cred_transfer() will not get ENOMEM.
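 * The pairing matters because security_transfer_creds() cannot fail;
 * callers such as the key management code build the empty blob first and
 * fill it in later. Roughly (illustrative, error handling elided):
 *
 *	struct cred *new = cred_alloc_blank();	// allocates the LSM blob too
 *
 *	if (!new)
 *		return -ENOMEM;
 *	security_transfer_creds(new, current_cred());	// no allocation here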
 * * Return: Returns 0 on success, negative values on failure. */ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); if (rc) return rc; rc = call_int_hook(cred_alloc_blank, 0, cred, gfp); if (unlikely(rc)) security_cred_free(cred); return rc; } /** * security_cred_free() - Free the cred's LSM blob and associated resources * @cred: credentials * * Deallocate and clear the cred->security field in a set of credentials. */ void security_cred_free(struct cred *cred) { /* * There is a failure case in prepare_creds() that * may result in a call here with ->security being NULL. */ if (unlikely(cred->security == NULL)) return; call_void_hook(cred_free, cred); kfree(cred->security); cred->security = NULL; } /** * security_prepare_creds() - Prepare a new set of credentials * @new: new credentials * @old: original credentials * @gfp: gfp flags * * Prepare a new set of credentials by copying the data from the old set. * * Return: Returns 0 on success, negative values on failure. */ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); if (rc) return rc; rc = call_int_hook(cred_prepare, 0, new, old, gfp); if (unlikely(rc)) security_cred_free(new); return rc; } /** * security_transfer_creds() - Transfer creds * @new: target credentials * @old: original credentials * * Transfer data from original creds to new creds. */ void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } /** * security_cred_getsecid() - Get the secid from a set of credentials * @c: credentials * @secid: secid value * * Retrieve the security identifier of the cred structure @c. In case of * failure, @secid will be set to zero. */ void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; call_void_hook(cred_getsecid, c, secid); } EXPORT_SYMBOL(security_cred_getsecid); /** * security_kernel_act_as() - Set the kernel credentials to act as secid * @new: credentials * @secid: secid * * Set the credentials for a kernel service to act as (subjective context). * The current task must be the one that nominated @secid. * * Return: Returns 0 if successful. */ int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, 0, new, secid); } /** * security_kernel_create_files_as() - Set file creation context using an inode * @new: target credentials * @inode: reference inode * * Set the file creation context in a set of credentials to be the same as the * objective context of the specified inode. The current task must be the one * that nominated @inode. * * Return: Returns 0 if successful. */ int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, 0, new, inode); } /** * security_kernel_module_request() - Check if loading a module is allowed * @kmod_name: module name * * Check if the kernel may automatically upcall to userspace so that userspace * can load a kernel module with the given name. * * Return: Returns 0 if successful. */ int security_kernel_module_request(char *kmod_name) { int ret; ret = call_int_hook(kernel_module_request, 0, kmod_name); if (ret) return ret; return integrity_kernel_module_request(kmod_name); } /** * security_kernel_read_file() - Read a file specified by userspace * @file: file * @id: file identifier * @contents: true if security_kernel_post_read_file() will be called * * Read a file specified by userspace.
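 * The @contents flag tells the hook whether a second call with the file
 * contents will follow, so a module that only inspects full contents can
 * defer its decision. A hypothetical sketch (the example_* names are
 * illustrative):
 *
 *	static int example_kernel_read_file(struct file *file,
 *					    enum kernel_read_file_id id,
 *					    bool contents)
 *	{
 *		if (contents)
 *			return 0;	// decide in kernel_post_read_file()
 *		return example_check_file(file);	// only chance to veto
 *	}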
 * * Return: Returns 0 if permission is granted. */ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int ret; ret = call_int_hook(kernel_read_file, 0, file, id, contents); if (ret) return ret; return ima_read_file(file, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_read_file); /** * security_kernel_post_read_file() - Read a file specified by userspace * @file: file * @buf: file contents * @size: size of file contents * @id: file identifier * * Read a file specified by userspace. This must be paired with a prior * security_kernel_read_file() call that indicated this hook would also be * called, see security_kernel_read_file() for more information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { int ret; ret = call_int_hook(kernel_post_read_file, 0, file, buf, size, id); if (ret) return ret; return ima_post_read_file(file, buf, size, id); } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); /** * security_kernel_load_data() - Load data provided by userspace * @id: data identifier * @contents: true if security_kernel_post_load_data() will be called * * Load data provided by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { int ret; ret = call_int_hook(kernel_load_data, 0, id, contents); if (ret) return ret; return ima_load_data(id, contents); } EXPORT_SYMBOL_GPL(security_kernel_load_data); /** * security_kernel_post_load_data() - Load userspace data from a non-file source * @buf: data * @size: size of data * @id: data identifier * @description: text description of data, specific to the id value * * Load data provided by a non-file source (usually userspace buffer). This * must be paired with a prior security_kernel_load_data() call that indicated * this hook would also be called, see security_kernel_load_data() for more * information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) { int ret; ret = call_int_hook(kernel_post_load_data, 0, buf, size, id, description); if (ret) return ret; return ima_post_load_data(buf, size, id, description); } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); /** * security_task_fix_setuid() - Update LSM with new user id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag values * * Update the module's state after setting one or more of the user identity * attributes of the current process. The @flags parameter indicates which of * the set*uid system calls invoked this hook. @new is the set of * credentials that will be installed. Modifications should be made to this * rather than to @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, 0, new, old, flags); } /** * security_task_fix_setgid() - Update LSM with new group id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag value * * Update the module's state after setting one or more of the group identity * attributes of the current process. The @flags parameter indicates which of * the set*gid system calls invoked this hook. @new is the set of credentials * that will be installed.
Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setgid, 0, new, old, flags); } /** * security_task_fix_setgroups() - Update LSM with new supplementary groups * @new: updated credentials * @old: credentials being replaced * * Update the module's state after setting the supplementary group identity * attributes of the current process. @new is the set of credentials that will * be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgroups(struct cred *new, const struct cred *old) { return call_int_hook(task_fix_setgroups, 0, new, old); } /** * security_task_setpgid() - Check if setting the pgid is allowed * @p: task being modified * @pgid: new pgid * * Check permission before setting the process group identifier of the process * @p to @pgid. * * Return: Returns 0 if permission is granted. */ int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, 0, p, pgid); } /** * security_task_getpgid() - Check if getting the pgid is allowed * @p: task * * Check permission before getting the process group identifier of the process * @p. * * Return: Returns 0 if permission is granted. */ int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, 0, p); } /** * security_task_getsid() - Check if getting the session id is allowed * @p: task * * Check permission before getting the session identifier of the process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, 0, p); } /** * security_current_getsecid_subj() - Get the current task's subjective secid * @secid: secid value * * Retrieve the subjective security identifier of the current task and return * it in @secid. In case of failure, @secid will be set to zero. */ void security_current_getsecid_subj(u32 *secid) { *secid = 0; call_void_hook(current_getsecid_subj, secid); } EXPORT_SYMBOL(security_current_getsecid_subj); /** * security_task_getsecid_obj() - Get a task's objective secid * @p: target task * @secid: secid value * * Retrieve the objective security identifier of the task_struct in @p and * return it in @secid. In case of failure, @secid will be set to zero. */ void security_task_getsecid_obj(struct task_struct *p, u32 *secid) { *secid = 0; call_void_hook(task_getsecid_obj, p, secid); } EXPORT_SYMBOL(security_task_getsecid_obj); /** * security_task_setnice() - Check if setting a task's nice value is allowed * @p: target task * @nice: nice value * * Check permission before setting the nice value of @p to @nice. * * Return: Returns 0 if permission is granted. */ int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, 0, p, nice); } /** * security_task_setioprio() - Check if setting a task's ioprio is allowed * @p: target task * @ioprio: ioprio value * * Check permission before setting the ioprio value of @p to @ioprio. * * Return: Returns 0 if permission is granted. */ int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, 0, p, ioprio); } /** * security_task_getioprio() - Check if getting a task's ioprio is allowed * @p: task * * Check permission before getting the ioprio value of @p. * * Return: Returns 0 if permission is granted. 
 */ int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, 0, p); } /** * security_task_prlimit() - Check if getting/setting resource limits is allowed * @cred: current task credentials * @tcred: target task credentials * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both * * Check permission before getting and/or setting the resource limits of * another task. * * Return: Returns 0 if permission is granted. */ int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, 0, cred, tcred, flags); } /** * security_task_setrlimit() - Check if setting a new rlimit value is allowed * @p: target task's group leader * @resource: resource whose limit is being set * @new_rlim: new resource limit * * Check permission before setting the resource limits of process @p for * @resource to @new_rlim. The old resource limit values can be examined by * dereferencing (p->signal->rlim + resource). * * Return: Returns 0 if permission is granted. */ int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, 0, p, resource, new_rlim); } /** * security_task_setscheduler() - Check if setting sched policy/param is allowed * @p: target task * * Check permission before setting scheduling policy and/or parameters of * process @p. * * Return: Returns 0 if permission is granted. */ int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, 0, p); } /** * security_task_getscheduler() - Check if getting scheduling info is allowed * @p: target task * * Check permission before obtaining scheduling information for process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, 0, p); } /** * security_task_movememory() - Check if moving memory is allowed * @p: task * * Check permission before moving memory owned by process @p. * * Return: Returns 0 if permission is granted. */ int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, 0, p); } /** * security_task_kill() - Check if sending a signal is allowed * @p: target process * @info: signal information * @sig: signal value * @cred: credentials of the signal sender, NULL if @current * * Check permission before sending signal @sig to @p. @info can be NULL, the * constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from * the kernel and should typically be permitted. SIGIO signals are handled * separately by the send_sigiotask hook in file_security_ops. * * Return: Returns 0 if permission is granted. */ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, 0, p, info, sig, cred); } /** * security_task_prctl() - Check if a prctl op is allowed * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Check permission before performing a process control operation on the * current process. * * Return: Return -ENOSYS if no one wanted to handle this op, any other value * to cause prctl() to return immediately with that value.
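 * Unhandled options must return -ENOSYS (the hook's default) so that other
 * LSMs and the core prctl() code still get a chance to act; e.g. a
 * hypothetical hook (PR_EXAMPLE_OP is illustrative, not a real option):
 *
 *	static int example_task_prctl(int option, unsigned long arg2,
 *				      unsigned long arg3, unsigned long arg4,
 *				      unsigned long arg5)
 *	{
 *		if (option != PR_EXAMPLE_OP)
 *			return -ENOSYS;	// not ours, keep looking
 *		return example_apply_op(arg2);
 *	}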
 */ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.task_prctl, list) { thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4, arg5); if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; } } return rc; } /** * security_task_to_inode() - Set the security attributes of a task's inode * @p: task * @inode: inode * * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. */ void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } /** * security_create_user_ns() - Check if creating a new userns is allowed * @cred: prepared creds * * Check permission prior to creating a new user namespace. * * Return: Returns 0 if successful, otherwise < 0 error code. */ int security_create_user_ns(const struct cred *cred) { return call_int_hook(userns_create, 0, cred); } /** * security_ipc_permission() - Check if sysv ipc access is allowed * @ipcp: ipc permission structure * @flag: requested permissions * * Check permissions for access to IPC. * * Return: Returns 0 if permission is granted. */ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, 0, ipcp, flag); } /** * security_ipc_getsecid() - Get the sysv ipc object's secid * @ipcp: ipc permission structure * @secid: secid pointer * * Get the secid associated with the ipc object. In case of failure, @secid * will be set to zero. */ void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { *secid = 0; call_void_hook(ipc_getsecid, ipcp, secid); } /** * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob * @msg: message structure * * Allocate and attach a security structure to the msg->security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if operation was successful and permission is granted. */ int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); if (unlikely(rc)) return rc; rc = call_int_hook(msg_msg_alloc_security, 0, msg); if (unlikely(rc)) security_msg_msg_free(msg); return rc; } /** * security_msg_msg_free() - Free a sysv ipc message LSM blob * @msg: message structure * * Deallocate the security structure for this message. */ void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); kfree(msg->security); msg->security = NULL; } /** * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Allocate and attach a security structure to @msq. The security field is * initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); if (unlikely(rc)) return rc; rc = call_int_hook(msg_queue_alloc_security, 0, msq); if (unlikely(rc)) security_msg_queue_free(msq); return rc; } /** * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Deallocate the security field @msq->security for the message queue.
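 * Note the pattern shared by all of the SysV IPC blobs above and below:
 * the framework allocation runs first, the LSM hook second, and on hook
 * failure the matching *_free() helper unwinds so no blob leaks:
 *
 *	rc = lsm_ipc_alloc(msq);		// framework blob
 *	if (unlikely(rc))
 *		return rc;
 *	rc = call_int_hook(msg_queue_alloc_security, 0, msq);
 *	if (unlikely(rc))
 *		security_msg_queue_free(msq);	// unwind on hook failure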
 */ void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); kfree(msq->security); msq->security = NULL; } /** * security_msg_queue_associate() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @msqflg: operation flags * * Check permission when a message queue is requested through the msgget system * call. This hook is only called when returning the message queue identifier * for an existing message queue, not when a new message queue is created. * * Return: Return 0 if permission is granted. */ int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, 0, msq, msqflg); } /** * security_msg_queue_msgctl() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @cmd: operation * * Check permission when a message control operation specified by @cmd is to be * performed on the message queue with permissions in @msq. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, 0, msq, cmd); } /** * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed * @msq: sysv ipc permission structure * @msg: message * @msqflg: operation flags * * Check permission before a message, @msg, is enqueued on the message queue * with permissions specified in @msq. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg); } /** * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed * @msq: sysv ipc permission structure * @msg: message * @target: target task * @type: type of message requested * @mode: operation flags * * Check permission before a message, @msg, is removed from the message queue. * The @target task structure contains a pointer to the process that will be * receiving the message (not equal to the current process when inline receives * are being performed). * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type, mode); } /** * security_shm_alloc() - Allocate a sysv shm LSM blob * @shp: sysv ipc permission structure * * Allocate and attach a security structure to the @shp security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); if (unlikely(rc)) return rc; rc = call_int_hook(shm_alloc_security, 0, shp); if (unlikely(rc)) security_shm_free(shp); return rc; } /** * security_shm_free() - Free a sysv shm LSM blob * @shp: sysv ipc permission structure * * Deallocate the security structure @shp->security for the memory segment. */ void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); kfree(shp->security); shp->security = NULL; } /** * security_shm_associate() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @shmflg: operation flags * * Check permission when a shared memory region is requested through the shmget * system call.
This hook is only called when returning the shared memory * region identifier for an existing region, not when a new shared memory * region is created. * * Return: Returns 0 if permission is granted. */ int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, 0, shp, shmflg); } /** * security_shm_shmctl() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @cmd: operation * * Check permission when a shared memory control operation specified by @cmd is * to be performed on the shared memory region with permissions in @shp. * * Return: Return 0 if permission is granted. */ int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, 0, shp, cmd); } /** * security_shm_shmat() - Check if a sysv shm attach operation is allowed * @shp: sysv ipc permission structure * @shmaddr: address of memory region to attach * @shmflg: operation flags * * Check permissions prior to allowing the shmat system call to attach the * shared memory segment with permissions @shp to the data segment of the * calling process. The attaching address is specified by @shmaddr. * * Return: Returns 0 if permission is granted. */ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg); } /** * security_sem_alloc() - Allocate a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Allocate and attach a security structure to the @sma security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); if (unlikely(rc)) return rc; rc = call_int_hook(sem_alloc_security, 0, sma); if (unlikely(rc)) security_sem_free(sma); return rc; } /** * security_sem_free() - Free a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Deallocate security structure @sma->security for the semaphore. */ void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); kfree(sma->security); sma->security = NULL; } /** * security_sem_associate() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @semflg: operation flags * * Check permission when a semaphore is requested through the semget system * call. This hook is only called when returning the semaphore identifier for * an existing semaphore, not when a new one must be created. * * Return: Returns 0 if permission is granted. */ int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, 0, sma, semflg); } /** * security_sem_semctl() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @cmd: operation * * Check permission when a semaphore operation specified by @cmd is to be * performed on the semaphore. * * Return: Returns 0 if permission is granted. */ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, 0, sma, cmd); } /** * security_sem_semop() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @sops: operations to perform * @nsops: number of operations * @alter: flag indicating changes will be made * * Check permissions before performing operations on members of the semaphore * set. If the @alter flag is nonzero, the semaphore set may be modified. 
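 * For example, a wait-for-zero operation does not modify the set, so a
 * caller performing only such operations is doing a read-style semop
 * (illustrative userspace snippet):
 *
 *	struct sembuf sop = { .sem_num = 0, .sem_op = 0, .sem_flg = 0 };
 *
 *	semop(semid, &sop, 1);	// sem_op == 0: @alter stays 0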
 * * Return: Returns 0 if permission is granted. */ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { return call_int_hook(sem_semop, 0, sma, sops, nsops, alter); } /** * security_d_instantiate() - Populate an inode's LSM state based on a dentry * @dentry: dentry * @inode: inode * * Fill in @inode security information for a @dentry if allowed. */ void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) return; call_void_hook(d_instantiate, dentry, inode); } EXPORT_SYMBOL(security_d_instantiate); /* * Please keep this in sync with its counterpart in security/lsm_syscalls.c */ /** * security_getselfattr - Read an LSM attribute of the current process. * @attr: which attribute to return * @uctx: the user-space destination for the information, or NULL * @size: pointer to the size of space available to receive the data * @flags: special handling options. LSM_FLAG_SINGLE indicates that only * attributes associated with the LSM identified in the passed @uctx be * reported. * * A NULL value for @uctx can be used to get both the number of attributes * and the size of the data. * * Returns the number of attributes found on success, negative value * on error. @size is reset to the total size of the data. * If @size is insufficient to contain the data -E2BIG is returned. */ int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, size_t __user *size, u32 flags) { struct security_hook_list *hp; struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, }; u8 __user *base = (u8 __user *)uctx; size_t total = 0; size_t entrysize; size_t left; bool toobig = false; bool single = false; int count = 0; int rc; if (attr == LSM_ATTR_UNDEF) return -EINVAL; if (size == NULL) return -EINVAL; if (get_user(left, size)) return -EFAULT; if (flags) { /* * Only flag supported is LSM_FLAG_SINGLE */ if (flags != LSM_FLAG_SINGLE || !uctx) return -EINVAL; if (copy_from_user(&lctx, uctx, sizeof(lctx))) return -EFAULT; /* * If the LSM ID isn't specified it is an error. */ if (lctx.id == LSM_ID_UNDEF) return -EINVAL; single = true; } /* * In the usual case gather all the data from the LSMs. * In the single case only get the data from the LSM specified. */ hlist_for_each_entry(hp, &security_hook_heads.getselfattr, list) { if (single && lctx.id != hp->lsmid->id) continue; entrysize = left; if (base) uctx = (struct lsm_ctx __user *)(base + total); rc = hp->hook.getselfattr(attr, uctx, &entrysize, flags); if (rc == -EOPNOTSUPP) { rc = 0; continue; } if (rc == -E2BIG) { rc = 0; left = 0; toobig = true; } else if (rc < 0) return rc; else left -= entrysize; total += entrysize; count += rc; if (single) break; } if (put_user(total, size)) return -EFAULT; if (toobig) return -E2BIG; if (count == 0) return LSM_RET_DEFAULT(getselfattr); return count; } /* * Please keep this in sync with its counterpart in security/lsm_syscalls.c */ /** * security_setselfattr - Set an LSM attribute on the current process. * @attr: which attribute to set * @uctx: the user-space source for the information * @size: the size of the data * @flags: reserved for future use, must be 0 * * Set an LSM attribute for the current process. The LSM, attribute * and new value are included in @uctx. * * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT * if the user buffer is inaccessible, E2BIG if size is too big, or an * LSM specific failure.
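 * The size checks below require @size and the lsm_ctx @len and @ctx_len
 * fields to be mutually consistent; e.g. a well-formed record carrying a
 * 7-byte context value (a hypothetical label "foo_t:" plus NUL) has:
 *
 *	ctx->ctx_len = 7;				// payload only
 *	ctx->len     = sizeof(struct lsm_ctx) + 7;	// whole record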
 */ int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, size_t size, u32 flags) { struct security_hook_list *hp; struct lsm_ctx *lctx; int rc = LSM_RET_DEFAULT(setselfattr); if (flags) return -EINVAL; if (size < sizeof(*lctx)) return -EINVAL; if (size > PAGE_SIZE) return -E2BIG; lctx = memdup_user(uctx, size); if (IS_ERR(lctx)) return PTR_ERR(lctx); if (size < lctx->len || size < lctx->ctx_len + sizeof(*lctx) || lctx->len < lctx->ctx_len + sizeof(*lctx)) { rc = -EINVAL; goto free_out; } hlist_for_each_entry(hp, &security_hook_heads.setselfattr, list) if ((hp->lsmid->id) == lctx->id) { rc = hp->hook.setselfattr(attr, lctx, size, flags); break; } free_out: kfree(lctx); return rc; } /** * security_getprocattr() - Read an attribute for a task * @p: the task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * * Read attribute @name for task @p and store it into @value if allowed. * * Return: Returns the length of @value on success, a negative value otherwise. */ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) { if (lsmid != 0 && lsmid != hp->lsmid->id) continue; return hp->hook.getprocattr(p, name, value); } return LSM_RET_DEFAULT(getprocattr); } /** * security_setprocattr() - Set an attribute for a task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * @size: attribute value size * * Write (set) the current task's attribute @name to @value, size @size if * allowed. * * Return: Returns bytes written on success, a negative value otherwise. */ int security_setprocattr(int lsmid, const char *name, void *value, size_t size) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) { if (lsmid != 0 && lsmid != hp->lsmid->id) continue; return hp->hook.setprocattr(name, value, size); } return LSM_RET_DEFAULT(setprocattr); } /** * security_netlink_send() - Save info and check if netlink sending is allowed * @sk: sending socket * @skb: netlink message * * Save security information for a netlink message so that permission checking * can be performed when the message is processed. The security information * can be saved using the eff_cap field of the netlink_skb_parms structure. * Also may be used to provide fine grained control over message transmission. * * Return: Returns 0 if the information was successfully saved and message is * allowed to be transmitted. */ int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, 0, sk, skb); } /** * security_ismaclabel() - Check if the named attribute is a MAC label * @name: full extended attribute name * * Check if the extended attribute specified by @name represents a MAC label. * * Return: Returns 1 if name is a MAC attribute otherwise returns 0. */ int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, 0, name); } EXPORT_SYMBOL(security_ismaclabel); /** * security_secid_to_secctx() - Convert a secid to a secctx * @secid: secid * @secdata: secctx * @seclen: secctx length * * Convert secid to security context. If @secdata is NULL the length of the * result will be returned in @seclen, but no @secdata will be returned. This * does mean that the length could change between calls to check the length and * the next call which actually allocates and returns the @secdata. * * Return: Return 0 on success, error on failure.
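 * Callers must release the returned buffer with security_release_secctx();
 * the usual pattern (illustrative) is:
 *
 *	char *ctx;
 *	u32 len;
 *
 *	if (!security_secid_to_secctx(secid, &ctx, &len)) {
 *		// ... use ctx/len, e.g. for audit or procfs output ...
 *		security_release_secctx(ctx, len);	// always pair
 *	}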
 */ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { struct security_hook_list *hp; int rc; /* * Currently, only one LSM can implement secid_to_secctx (i.e. this * LSM hook is not "stackable"). */ hlist_for_each_entry(hp, &security_hook_heads.secid_to_secctx, list) { rc = hp->hook.secid_to_secctx(secid, secdata, seclen); if (rc != LSM_RET_DEFAULT(secid_to_secctx)) return rc; } return LSM_RET_DEFAULT(secid_to_secctx); } EXPORT_SYMBOL(security_secid_to_secctx); /** * security_secctx_to_secid() - Convert a secctx to a secid * @secdata: secctx * @seclen: length of secctx * @secid: secid * * Convert security context to secid. * * Return: Returns 0 on success, error on failure. */ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; return call_int_hook(secctx_to_secid, 0, secdata, seclen, secid); } EXPORT_SYMBOL(security_secctx_to_secid); /** * security_release_secctx() - Free a secctx buffer * @secdata: secctx * @seclen: length of secctx * * Release the security context. */ void security_release_secctx(char *secdata, u32 seclen) { call_void_hook(release_secctx, secdata, seclen); } EXPORT_SYMBOL(security_release_secctx); /** * security_inode_invalidate_secctx() - Invalidate an inode's security label * @inode: inode * * Notify the security module that it must revalidate the security context of * an inode. */ void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); /** * security_inode_notifysecctx() - Notify the LSM of an inode's security label * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * Notify the security module of what the security context of an inode should * be. Initializes the incore security context managed by the security module * for this inode. Example usage: NFS client invokes this hook to initialize * the security context in its incore inode to the value provided by the server * for the file when the server returned the file's attributes to the client. * Must be called with inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); /** * security_inode_setsecctx() - Change the security label of an inode * @dentry: dentry * @ctx: secctx * @ctxlen: length of secctx * * Change the security context of an inode. Updates the incore security * context managed by the security module and invokes the fs code as needed * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the * context. Example usage: NFS server invokes this hook to change the security * context in its incore inode and on the backing filesystem to a value * provided by the client on a SETATTR operation. Must be called with * inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); /** * security_inode_getsecctx() - Get the security label of an inode * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * On success, returns 0 and fills out @ctx and @ctxlen with the security * context for the given @inode. * * Return: Returns 0 on success, error on failure.
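 * Together with security_inode_notifysecctx() and
 * security_inode_setsecctx() above, this forms the in-core label
 * import/export trio; an exporting filesystem would do roughly
 * (illustrative):
 *
 *	void *ctx;
 *	u32 ctxlen;
 *
 *	if (!security_inode_getsecctx(inode, &ctx, &ctxlen)) {
 *		// ... marshal ctx/ctxlen into the reply ...
 *		security_release_secctx(ctx, ctxlen);
 *	}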
 */ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE /** * security_post_notification() - Check if a watch notification can be posted * @w_cred: credentials of the task that set the watch * @cred: credentials of the task which triggered the watch * @n: the notification * * Check to see if a watch notification can be posted to a particular queue. * * Return: Returns 0 if permission is granted. */ int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) { return call_int_hook(post_notification, 0, w_cred, cred, n); } #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS /** * security_watch_key() - Check if a task is allowed to watch for key events * @key: the key to watch * * Check to see if a process is allowed to watch for event notifications from * a key or keyring. * * Return: Returns 0 if permission is granted. */ int security_watch_key(struct key *key) { return call_int_hook(watch_key, 0, key); } #endif /* CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK /** * security_unix_stream_connect() - Check if an AF_UNIX stream connection is allowed * @sock: originating sock * @other: peer sock * @newsk: new sock * * Check permissions before establishing a Unix domain stream connection * between @sock and @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return call_int_hook(unix_stream_connect, 0, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); /** * security_unix_may_send() - Check if AF_UNIX socket can send datagrams * @sock: originating sock * @other: peer sock * * Check permissions before connecting or sending datagrams from @sock to * @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted.
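 * The abstract name space is selected from userspace by a leading NUL byte
 * in sun_path, so no filesystem object (and hence no inode-based
 * permission check) ever exists for such a socket; e.g.:
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *	memcpy(sun.sun_path, "\0example", 8);	// abstract, no inode
 *	bind(fd, (struct sockaddr *)&sun,
 *	     offsetof(struct sockaddr_un, sun_path) + 8);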
*/ int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, 0, sock, other); } EXPORT_SYMBOL(security_unix_may_send); /** * security_socket_create() - Check if creating a new socket is allowed * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * Check permissions prior to creating a new socket. * * Return: Returns 0 if permission is granted. */ int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, 0, family, type, protocol, kern); } /** * security_socket_post_create() - Initialize a newly created socket * @sock: socket * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * This hook allows a module to update or allocate a per-socket security * structure. Note that the security field was not added directly to the socket * structure, but rather, the socket security information is stored in the * associated inode. Typically, the inode alloc_security hook will allocate * and attach security information to SOCK_INODE(sock)->i_security. This hook * may be used to update the SOCK_INODE(sock)->i_security field with additional * information that wasn't available when the inode was allocated. * * Return: Returns 0 if permission is granted. */ int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, 0, sock, family, type, protocol, kern); } /** * security_socket_socketpair() - Check if creating a socketpair is allowed * @socka: first socket * @sockb: second socket * * Check permissions before creating a fresh pair of sockets. * * Return: Returns 0 if permission is granted and the connection was * established. */ int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, 0, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); /** * security_socket_bind() - Check if a socket bind operation is allowed * @sock: socket * @address: requested bind address * @addrlen: length of address * * Check permission before socket protocol layer bind operation is performed * and the socket @sock is bound to the address specified in the @address * parameter. * * Return: Returns 0 if permission is granted. */ int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, 0, sock, address, addrlen); } /** * security_socket_connect() - Check if a socket connect operation is allowed * @sock: socket * @address: address of remote connection point * @addrlen: length of address * * Check permission before socket protocol layer connect operation attempts to * connect socket @sock to a remote address, @address. * * Return: Returns 0 if permission is granted. */ int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, 0, sock, address, addrlen); } /** * security_socket_listen() - Check if a socket is allowed to listen * @sock: socket * @backlog: connection queue size * * Check permission before socket protocol layer listen operation. * * Return: Returns 0 if permission is granted. 
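 * These per-call socket hooks map directly onto the socket system calls; a
 * hypothetical module could, for instance, gate listening services like
 * this (illustrative sketch, the example_* names are not part of this
 * file):
 *
 *	static int example_socket_listen(struct socket *sock, int backlog)
 *	{
 *		if (sock->sk->sk_num < 1024 &&
 *		    !capable(CAP_NET_BIND_SERVICE))
 *			return -EACCES;	// low ports need privilege
 *		return 0;
 *	}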
/**
 * security_socket_socketpair() - Check if creating a socketpair is allowed
 * @socka: first socket
 * @sockb: second socket
 *
 * Check permissions before creating a fresh pair of sockets.
 *
 * Return: Returns 0 if permission is granted and the connection was
 *         established.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
	return call_int_hook(socket_socketpair, 0, socka, sockb);
}
EXPORT_SYMBOL(security_socket_socketpair);

/**
 * security_socket_bind() - Check if a socket bind operation is allowed
 * @sock: socket
 * @address: requested bind address
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer bind operation is performed
 * and the socket @sock is bound to the address specified in the @address
 * parameter.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_bind(struct socket *sock,
			 struct sockaddr *address, int addrlen)
{
	return call_int_hook(socket_bind, 0, sock, address, addrlen);
}

/**
 * security_socket_connect() - Check if a socket connect operation is allowed
 * @sock: socket
 * @address: address of remote connection point
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer connect operation attempts to
 * connect socket @sock to a remote address, @address.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_connect(struct socket *sock,
			    struct sockaddr *address, int addrlen)
{
	return call_int_hook(socket_connect, 0, sock, address, addrlen);
}

/**
 * security_socket_listen() - Check if a socket is allowed to listen
 * @sock: socket
 * @backlog: connection queue size
 *
 * Check permission before socket protocol layer listen operation.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
	return call_int_hook(socket_listen, 0, sock, backlog);
}

/**
 * security_socket_accept() - Check if a socket is allowed to accept connections
 * @sock: listening socket
 * @newsock: newly created connection socket
 *
 * Check permission before accepting a new connection. Note that the new
 * socket, @newsock, has been created and some information copied to it, but
 * the accept operation has not actually been performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
	return call_int_hook(socket_accept, 0, sock, newsock);
}

/**
 * security_socket_sendmsg() - Check if sending a message is allowed
 * @sock: sending socket
 * @msg: message to send
 * @size: size of message
 *
 * Check permission before transmitting a message to another socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
	return call_int_hook(socket_sendmsg, 0, sock, msg, size);
}

/**
 * security_socket_recvmsg() - Check if receiving a message is allowed
 * @sock: receiving socket
 * @msg: message to receive
 * @size: size of message
 * @flags: operational flags
 *
 * Check permission before receiving a message from a socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
			    int size, int flags)
{
	return call_int_hook(socket_recvmsg, 0, sock, msg, size, flags);
}

/**
 * security_socket_getsockname() - Check if reading the socket addr is allowed
 * @sock: socket
 *
 * Check permission before reading the local address (name) of the socket
 * object.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockname(struct socket *sock)
{
	return call_int_hook(socket_getsockname, 0, sock);
}

/**
 * security_socket_getpeername() - Check if reading the peer's addr is allowed
 * @sock: socket
 *
 * Check permission before reading the remote address (name) of a socket
 * object.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getpeername(struct socket *sock)
{
	return call_int_hook(socket_getpeername, 0, sock);
}

/**
 * security_socket_getsockopt() - Check if reading a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before retrieving the options associated with socket
 * @sock.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
	return call_int_hook(socket_getsockopt, 0, sock, level, optname);
}

/**
 * security_socket_setsockopt() - Check if setting a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before setting the options associated with socket @sock.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
	return call_int_hook(socket_setsockopt, 0, sock, level, optname);
}

/**
 * security_socket_shutdown() - Checks if shutting down the socket is allowed
 * @sock: socket
 * @how: flag indicating how sends and receives are handled
 *
 * Checks permission before all or part of a connection on the socket @sock is
 * shut down.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
	return call_int_hook(socket_shutdown, 0, sock, how);
}

/**
 * security_sock_rcv_skb() - Check if an incoming network packet is allowed
 * @sk: destination sock
 * @skb: incoming packet
 *
 * Check permissions on incoming network packets. This hook is distinct from
 * Netfilter's IP input hooks since it is the first time that the incoming
 * sk_buff @skb has been associated with a particular socket, @sk. Must not
 * sleep inside this hook because some callers hold spinlocks.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	return call_int_hook(socket_sock_rcv_skb, 0, sk, skb);
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/**
 * security_socket_getpeersec_stream() - Get the remote peer label
 * @sock: socket
 * @optval: destination buffer
 * @optlen: size of peer label copied into the buffer
 * @len: maximum size of the destination buffer
 *
 * This hook allows the security module to provide peer socket security state
 * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC.
 * For tcp sockets this can be meaningful if the socket is associated with an
 * ipsec SA.
 *
 * Return: Returns 0 if all is well, otherwise, typical getsockopt return
 *         values.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
				      sockptr_t optlen, unsigned int len)
{
	return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock,
			     optval, optlen, len);
}

/**
 * security_socket_getpeersec_dgram() - Get the remote peer label
 * @sock: socket
 * @skb: datagram packet
 * @secid: remote peer label secid
 *
 * This hook allows the security module to provide peer socket security state
 * for udp sockets on a per-packet basis to userspace via getsockopt
 * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC
 * option via setsockopt. It can then retrieve the security state returned by
 * this hook for a packet via the SCM_SECURITY ancillary message type.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_socket_getpeersec_dgram(struct socket *sock,
				     struct sk_buff *skb, u32 *secid)
{
	return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock,
			     skb, secid);
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);
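/*
 * Editorial sketch (userspace, not kernel code): how an application consumes
 * the per-packet label that security_socket_getpeersec_dgram() makes
 * available on a UDP socket. IP_PASSSEC is enabled with setsockopt() and the
 * label arrives as an SCM_SECURITY ancillary message; the fallback macro
 * values below match <linux/in.h> and <linux/socket.h>.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *
 *	#ifndef IP_PASSSEC
 *	#define IP_PASSSEC 18
 *	#endif
 *	#ifndef SCM_SECURITY
 *	#define SCM_SECURITY 0x03
 *	#endif
 *
 *	static void print_peer_label(int fd)
 *	{
 *		char buf[1024], cbuf[256];
 *		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int one = 1;
 *
 *		setsockopt(fd, SOL_IP, IP_PASSSEC, &one, sizeof(one));
 *		if (recvmsg(fd, &msg, 0) < 0)
 *			return;
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *		     cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_IP &&
 *			    cmsg->cmsg_type == SCM_SECURITY)
 *				printf("peer label: %s\n",
 *				       (char *)CMSG_DATA(cmsg));
 *	}
 */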
/**
 * security_sk_alloc() - Allocate and initialize a sock's LSM blob
 * @sk: sock
 * @family: protocol family
 * @priority: gfp flags
 *
 * Allocate and attach a security structure to the sk->sk_security field, which
 * is used to copy security attributes between local stream sockets.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
	return call_int_hook(sk_alloc_security, 0, sk, family, priority);
}

/**
 * security_sk_free() - Free the sock's LSM blob
 * @sk: sock
 *
 * Deallocate security structure.
 */
void security_sk_free(struct sock *sk)
{
	call_void_hook(sk_free_security, sk);
}

/**
 * security_sk_clone() - Clone a sock's LSM state
 * @sk: original sock
 * @newsk: target sock
 *
 * Clone/copy security structure.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
	call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/**
 * security_sk_classify_flow() - Set a flow's secid based on socket
 * @sk: original socket
 * @flic: target flow
 *
 * Set the target flow's secid to socket's secid.
 */
void security_sk_classify_flow(const struct sock *sk,
			       struct flowi_common *flic)
{
	call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/**
 * security_req_classify_flow() - Set a flow's secid based on request_sock
 * @req: request_sock
 * @flic: target flow
 *
 * Sets @flic's secid to @req's secid.
 */
void security_req_classify_flow(const struct request_sock *req,
				struct flowi_common *flic)
{
	call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/**
 * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket
 * @sk: sock being grafted
 * @parent: target parent socket
 *
 * Sets @parent's inode secid to @sk's secid and updates @sk with any necessary
 * LSM state from @parent.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
	call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/**
 * security_inet_conn_request() - Set request_sock state using incoming connect
 * @sk: parent listening sock
 * @skb: incoming connection
 * @req: new request_sock
 *
 * Initialize the @req LSM state based on @sk and the incoming connect in @skb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inet_conn_request(const struct sock *sk,
			       struct sk_buff *skb, struct request_sock *req)
{
	return call_int_hook(inet_conn_request, 0, sk, skb, req);
}
EXPORT_SYMBOL(security_inet_conn_request);

/**
 * security_inet_csk_clone() - Set new sock LSM state based on request_sock
 * @newsk: new sock
 * @req: connection request_sock
 *
 * Set the LSM state of @newsk using the LSM state from @req.
 */
void security_inet_csk_clone(struct sock *newsk,
			     const struct request_sock *req)
{
	call_void_hook(inet_csk_clone, newsk, req);
}

/**
 * security_inet_conn_established() - Update sock's LSM state with connection
 * @sk: sock
 * @skb: connection packet
 *
 * Update @sk's LSM state to represent a new connection from @skb.
 */
void security_inet_conn_established(struct sock *sk, struct sk_buff *skb)
{
	call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/**
 * security_secmark_relabel_packet() - Check if setting a secmark is allowed
 * @secid: new secmark value
 *
 * Check if the process should be allowed to relabel packets to @secid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_secmark_relabel_packet(u32 secid)
{
	return call_int_hook(secmark_relabel_packet, 0, secid);
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/**
 * security_secmark_refcount_inc() - Increment the secmark labeling rule count
 *
 * Tells the LSM to increment the number of secmark labeling rules loaded.
 */
void security_secmark_refcount_inc(void)
{
	call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/**
 * security_secmark_refcount_dec() - Decrement the secmark labeling rule count
 *
 * Tells the LSM to decrement the number of secmark labeling rules loaded.
 */
void security_secmark_refcount_dec(void)
{
	call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/**
 * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device
 * @security: pointer to the LSM blob
 *
 * This hook allows a module to allocate a security structure for a TUN device,
 * returning the pointer in @security.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_tun_dev_alloc_security(void **security)
{
	return call_int_hook(tun_dev_alloc_security, 0, security);
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/**
 * security_tun_dev_free_security() - Free a TUN device LSM blob
 * @security: LSM blob
 *
 * This hook allows a module to free the security structure for a TUN device.
 */
void security_tun_dev_free_security(void *security)
{
	call_void_hook(tun_dev_free_security, security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/**
 * security_tun_dev_create() - Check if creating a TUN device is allowed
 *
 * Check permissions prior to creating a new TUN device.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_create(void)
{
	return call_int_hook(tun_dev_create, 0);
}
EXPORT_SYMBOL(security_tun_dev_create);

/**
 * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed
 * @security: TUN device LSM blob
 *
 * Check permissions prior to attaching to a TUN device queue.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach_queue(void *security)
{
	return call_int_hook(tun_dev_attach_queue, 0, security);
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/**
 * security_tun_dev_attach() - Update TUN device LSM state on attach
 * @sk: associated sock
 * @security: TUN device LSM blob
 *
 * This hook can be used by the module to update any security state associated
 * with the TUN device's sock structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
	return call_int_hook(tun_dev_attach, 0, sk, security);
}
EXPORT_SYMBOL(security_tun_dev_attach);

/**
 * security_tun_dev_open() - Update TUN device LSM state on open
 * @security: TUN device LSM blob
 *
 * This hook can be used by the module to update any security state associated
 * with the TUN device's security structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_open(void *security)
{
	return call_int_hook(tun_dev_open, 0, security);
}
EXPORT_SYMBOL(security_tun_dev_open);

/**
 * security_sctp_assoc_request() - Update the LSM on a SCTP association req
 * @asoc: SCTP association
 * @skb: packet requesting the association
 *
 * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_assoc_request(struct sctp_association *asoc,
				struct sk_buff *skb)
{
	return call_int_hook(sctp_assoc_request, 0, asoc, skb);
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/**
 * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option
 * @sk: socket
 * @optname: SCTP option to validate
 * @address: list of IP addresses to validate
 * @addrlen: length of the address list
 *
 * Validate permissions required for each address associated with sock @sk.
 * Depending on @optname, the addresses will be treated as either a connect or
 * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using
 * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6).
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
			       struct sockaddr *address, int addrlen)
{
	return call_int_hook(sctp_bind_connect, 0, sk, optname,
			     address, addrlen);
}
EXPORT_SYMBOL(security_sctp_bind_connect);
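/*
 * Editorial sketch (hypothetical LSM callback): the address list handed to
 * security_sctp_bind_connect() is a packed array of sockaddr_in and
 * sockaddr_in6 entries, so an implementation walks it by address family,
 * much like SELinux does. Both functions below are invented for the example.
 */
static int mylsm_check_one_addr(struct sock *sk, int optname,
				struct sockaddr *addr)
{
	return 0;	/* hypothetical per-address policy check */
}

static int mylsm_sctp_bind_connect(struct sock *sk, int optname,
				   struct sockaddr *address, int addrlen)
{
	struct sockaddr *addr;
	int walk = 0, err;

	while (walk < addrlen) {
		addr = (struct sockaddr *)((char *)address + walk);
		switch (addr->sa_family) {
		case AF_INET:
			walk += sizeof(struct sockaddr_in);
			break;
		case AF_INET6:
			walk += sizeof(struct sockaddr_in6);
			break;
		default:
			return -EINVAL;
		}
		err = mylsm_check_one_addr(sk, optname, addr);
		if (err)
			return err;
	}
	return 0;
}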
/**
 * security_sctp_sk_clone() - Clone a SCTP sock's LSM state
 * @asoc: SCTP association
 * @sk: original sock
 * @newsk: target sock
 *
 * Called whenever a new socket is created by accept(2) (i.e. a TCP style
 * socket) or when a socket is 'peeled off', e.g. when userspace calls
 * sctp_peeloff(3).
 */
void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
			    struct sock *newsk)
{
	call_void_hook(sctp_sk_clone, asoc, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

/**
 * security_sctp_assoc_established() - Update LSM state when assoc established
 * @asoc: SCTP association
 * @skb: packet establishing the association
 *
 * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the
 * security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sctp_assoc_established(struct sctp_association *asoc,
				    struct sk_buff *skb)
{
	return call_int_hook(sctp_assoc_established, 0, asoc, skb);
}
EXPORT_SYMBOL(security_sctp_assoc_established);

/**
 * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket
 * @sk: the owning MPTCP socket
 * @ssk: the new subflow
 *
 * Update the labeling for the given MPTCP subflow, to match the one of the
 * owning MPTCP socket. This hook has to be called after the socket creation
 * and initialization via the security_socket_create() and
 * security_socket_post_create() LSM hooks.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
	return call_int_hook(mptcp_add_subflow, 0, sk, ssk);
}
#endif /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND
/**
 * security_ib_pkey_access() - Check if access to an IB pkey is allowed
 * @sec: LSM blob
 * @subnet_prefix: subnet prefix of the port
 * @pkey: IB pkey
 *
 * Check permission to access a pkey when modifying a QP.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
	return call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey);
}
EXPORT_SYMBOL(security_ib_pkey_access);

/**
 * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed
 * @sec: LSM blob
 * @dev_name: IB device name
 * @port_num: port number
 *
 * Check permissions to send and receive SMPs on an end port.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_endport_manage_subnet(void *sec,
				      const char *dev_name, u8 port_num)
{
	return call_int_hook(ib_endport_manage_subnet, 0, sec,
			     dev_name, port_num);
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/**
 * security_ib_alloc_security() - Allocate an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Allocate a security structure for Infiniband objects.
 *
 * Return: Returns 0 on success, non-zero on failure.
 */
int security_ib_alloc_security(void **sec)
{
	return call_int_hook(ib_alloc_security, 0, sec);
}
EXPORT_SYMBOL(security_ib_alloc_security);

/**
 * security_ib_free_security() - Free an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Deallocate an Infiniband security structure.
 */
void security_ib_free_security(void *sec)
{
	call_void_hook(ib_free_security, sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/**
 * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob
 * @ctxp: xfrm security context being added to the SPD
 * @sec_ctx: security label provided by userspace
 * @gfp: gfp flags
 *
 * Allocate a security structure to the xp->security field; the security field
 * is initialized to NULL when the xfrm_policy is allocated.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
			       struct xfrm_user_sec_ctx *sec_ctx,
			       gfp_t gfp)
{
	return call_int_hook(xfrm_policy_alloc_security, 0, ctxp, sec_ctx, gfp);
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/**
 * security_xfrm_policy_clone() - Clone xfrm policy LSM state
 * @old_ctx: xfrm security context
 * @new_ctxp: target xfrm security context
 *
 * Allocate a security structure in new_ctxp that contains the information from
 * the old_ctx structure.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
			       struct xfrm_sec_ctx **new_ctxp)
{
	return call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp);
}

/**
 * security_xfrm_policy_free() - Free a xfrm security context
 * @ctx: xfrm security context
 *
 * Free LSM resources associated with @ctx.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
	call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/**
 * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed
 * @ctx: xfrm security context
 *
 * Authorize deletion of a SPD entry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
	return call_int_hook(xfrm_policy_delete_security, 0, ctx);
}

/**
 * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @sec_ctx: security label provided by userspace
 *
 * Allocate a security structure to the @x->security field; the security field
 * is initialized to NULL when the xfrm_state is allocated. Set the context to
 * correspond to @sec_ctx.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
			      struct xfrm_user_sec_ctx *sec_ctx)
{
	return call_int_hook(xfrm_state_alloc, 0, x, sec_ctx);
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/**
 * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @polsec: associated policy's security context
 * @secid: secid from the flow
 *
 * Allocate a security structure to the x->security field; the security field
 * is initialized to NULL when the xfrm_state is allocated. Set the context to
 * correspond to secid.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
				      struct xfrm_sec_ctx *polsec, u32 secid)
{
	return call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid);
}

/**
 * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed
 * @x: xfrm state
 *
 * Authorize deletion of x->security.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
	return call_int_hook(xfrm_state_delete_security, 0, x);
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/**
 * security_xfrm_state_free() - Free a xfrm state
 * @x: xfrm state
 *
 * Deallocate x->security.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
	call_void_hook(xfrm_state_free_security, x);
}

/**
 * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed
 * @ctx: target xfrm security context
 * @fl_secid: flow secid used to authorize access
 *
 * Check permission when a flow selects a xfrm_policy for processing XFRMs on a
 * packet. The hook is called when selecting either a per-socket policy or a
 * generic xfrm policy.
 *
 * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on
 *         other errors.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
{
	return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid);
}

/**
 * security_xfrm_state_pol_flow_match() - Check for a xfrm match
 * @x: xfrm state to match
 * @xp: xfrm policy to check for a match
 * @flic: flow to check for a match
 *
 * Check @xp and @flic for a match with @x.
 *
 * Return: Returns 1 if there is a match.
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
				       struct xfrm_policy *xp,
				       const struct flowi_common *flic)
{
	struct security_hook_list *hp;
	int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match);

	/*
	 * Since this function is expected to return 0 or 1, the judgment
	 * becomes difficult if multiple LSMs supply this call. Fortunately,
	 * we can use the first LSM's judgment because currently only SELinux
	 * supplies this call.
	 *
	 * For speed optimization, we explicitly break the loop rather than
	 * using the macro.
	 */
	hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match,
			     list) {
		rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic);
		break;
	}
	return rc;
}
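/*
 * For reference, the explicit loop above differs from what call_int_hook()
 * does: the macro (defined earlier in this file, reproduced here in slightly
 * simplified form) walks every registered hook and only stops on the first
 * non-zero return, while xfrm_state_pol_flow_match() must take the first
 * LSM's verdict even when that verdict is 0 or 1.
 *
 *	#define call_int_hook(FUNC, IRC, ...) ({			\
 *		int RC = IRC;						\
 *		do {							\
 *			struct security_hook_list *P;			\
 *									\
 *			hlist_for_each_entry(P,				\
 *					&security_hook_heads.FUNC, list) { \
 *				RC = P->hook.FUNC(__VA_ARGS__);		\
 *				if (RC != 0)				\
 *					break;				\
 *			}						\
 *		} while (0);						\
 *		RC;							\
 *	})
 */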
/**
 * security_xfrm_decode_session() - Determine the xfrm secid for a packet
 * @skb: xfrm packet
 * @secid: secid
 *
 * Decode the packet in @skb and return the security label in @secid.
 *
 * Return: Return 0 if all xfrms used have the same secid.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
	return call_int_hook(xfrm_decode_session, 0, skb, secid, 1);
}

void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
	int rc = call_int_hook(xfrm_decode_session, 0, skb,
			       &flic->flowic_secid, 0);

	BUG_ON(rc);
}
EXPORT_SYMBOL(security_skb_classify_flow);
#endif /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS
/**
 * security_key_alloc() - Allocate and initialize a kernel key LSM blob
 * @key: key
 * @cred: credentials
 * @flags: allocation flags
 *
 * Permit allocation of a key and assign security data. Note that key does not
 * have a serial number assigned at this point.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
		       unsigned long flags)
{
	return call_int_hook(key_alloc, 0, key, cred, flags);
}

/**
 * security_key_free() - Free a kernel key LSM blob
 * @key: key
 *
 * Notification of destruction; free security data.
 */
void security_key_free(struct key *key)
{
	call_void_hook(key_free, key);
}

/**
 * security_key_permission() - Check if a kernel key operation is allowed
 * @key_ref: key reference
 * @cred: credentials of actor requesting access
 * @need_perm: requested permissions
 *
 * See whether a specific operational right is granted to a process on a key.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
			    enum key_need_perm need_perm)
{
	return call_int_hook(key_permission, 0, key_ref, cred, need_perm);
}

/**
 * security_key_getsecurity() - Get the key's security label
 * @key: key
 * @buffer: security label buffer
 *
 * Get a textual representation of the security context attached to a key for
 * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the
 * storage for the NUL-terminated string and the caller should free it.
 *
 * Return: Returns the length of @buffer (including terminating NUL) or -ve if
 *         an error occurs. May also return 0 (and a NULL buffer pointer) if
 *         there is no security label assigned to the key.
 */
int security_key_getsecurity(struct key *key, char **buffer)
{
	*buffer = NULL;
	return call_int_hook(key_getsecurity, 0, key, buffer);
}
#endif /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT
/**
 * security_audit_rule_init() - Allocate and init an LSM audit rule struct
 * @field: audit action
 * @op: rule operator
 * @rulestr: rule context
 * @lsmrule: receive buffer for audit rule struct
 *
 * Allocate and initialize an LSM audit rule structure.
 *
 * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of
 *         an invalid rule.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule)
{
	return call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule);
}

/**
 * security_audit_rule_known() - Check if an audit rule contains LSM fields
 * @krule: audit rule
 *
 * Specifies whether given @krule contains any fields related to the current
 * LSM.
 *
 * Return: Returns 1 in case of relation found, 0 otherwise.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
	return call_int_hook(audit_rule_known, 0, krule);
}

/**
 * security_audit_rule_free() - Free an LSM audit rule struct
 * @lsmrule: audit rule struct
 *
 * Deallocate the LSM audit rule structure previously allocated by
 * audit_rule_init().
 */
void security_audit_rule_free(void *lsmrule)
{
	call_void_hook(audit_rule_free, lsmrule);
}

/**
 * security_audit_rule_match() - Check if a label matches an audit rule
 * @secid: security label
 * @field: LSM audit field
 * @op: matching operator
 * @lsmrule: audit rule
 *
 * Determine if given @secid matches a rule previously approved by
 * security_audit_rule_known().
 *
 * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on
 *         failure.
 */
int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
{
	return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule);
}
#endif /* CONFIG_AUDIT */
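/*
 * Editorial sketch (not part of security.c): the expected lifecycle of the
 * four audit hooks above, as driven by the audit subsystem. The wrapper
 * function and its arguments are invented for the example; the call order
 * is init -> known/match -> free.
 */
static int mylsm_audit_rule_example(struct audit_krule *krule, u32 field,
				    u32 op, char *rulestr, u32 secid)
{
	void *lsmrule;
	int rc;

	rc = security_audit_rule_init(field, op, rulestr, &lsmrule);
	if (rc)
		return rc;	/* e.g. -EINVAL for an invalid rule */

	if (security_audit_rule_known(krule))
		rc = security_audit_rule_match(secid, field, op, lsmrule);

	security_audit_rule_free(lsmrule);
	return rc;
}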
#ifdef CONFIG_BPF_SYSCALL
/**
 * security_bpf() - Check if the bpf syscall operation is allowed
 * @cmd: command
 * @attr: bpf attribute
 * @size: size
 *
 * Do an initial check for all bpf syscalls after the attribute is copied into
 * the kernel. The actual security module can implement its own rules to
 * check the specific cmd they need.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return call_int_hook(bpf, 0, cmd, attr, size);
}

/**
 * security_bpf_map() - Check if access to a bpf map is allowed
 * @map: bpf map
 * @fmode: mode
 *
 * Do a check when the kernel generates and returns a file descriptor for eBPF
 * maps.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
	return call_int_hook(bpf_map, 0, map, fmode);
}

/**
 * security_bpf_prog() - Check if access to a bpf program is allowed
 * @prog: bpf program
 *
 * Do a check when the kernel generates and returns a file descriptor for eBPF
 * programs.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
	return call_int_hook(bpf_prog, 0, prog);
}

/**
 * security_bpf_map_alloc() - Allocate a bpf map LSM blob
 * @map: bpf map
 *
 * Initialize the security field inside bpf map.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_map_alloc(struct bpf_map *map)
{
	return call_int_hook(bpf_map_alloc_security, 0, map);
}

/**
 * security_bpf_prog_alloc() - Allocate a bpf program LSM blob
 * @aux: bpf program aux info struct
 *
 * Initialize the security field inside bpf program.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
{
	return call_int_hook(bpf_prog_alloc_security, 0, aux);
}

/**
 * security_bpf_map_free() - Free a bpf map's LSM blob
 * @map: bpf map
 *
 * Clean up the security information stored inside bpf map.
 */
void security_bpf_map_free(struct bpf_map *map)
{
	call_void_hook(bpf_map_free_security, map);
}

/**
 * security_bpf_prog_free() - Free a bpf program's LSM blob
 * @aux: bpf program aux info struct
 *
 * Clean up the security information stored inside bpf prog.
 */
void security_bpf_prog_free(struct bpf_prog_aux *aux)
{
	call_void_hook(bpf_prog_free_security, aux);
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * security_locked_down() - Check if a kernel feature is allowed
 * @what: requested kernel feature
 *
 * Determine whether a kernel feature that potentially enables arbitrary code
 * execution in kernel space should be permitted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_locked_down(enum lockdown_reason what)
{
	return call_int_hook(locked_down, 0, what);
}
EXPORT_SYMBOL(security_locked_down);

#ifdef CONFIG_PERF_EVENTS
/**
 * security_perf_event_open() - Check if a perf event open is allowed
 * @attr: perf event attribute
 * @type: type of event
 *
 * Check whether the @type of perf_event_open syscall is allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_open(struct perf_event_attr *attr, int type)
{
	return call_int_hook(perf_event_open, 0, attr, type);
}

/**
 * security_perf_event_alloc() - Allocate a perf event LSM blob
 * @event: perf event
 *
 * Allocate and save perf_event security info.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_perf_event_alloc(struct perf_event *event)
{
	return call_int_hook(perf_event_alloc, 0, event);
}

/**
 * security_perf_event_free() - Free a perf event LSM blob
 * @event: perf event
 *
 * Release (free) perf_event security info.
 */
void security_perf_event_free(struct perf_event *event)
{
	call_void_hook(perf_event_free, event);
}

/**
 * security_perf_event_read() - Check if reading a perf event label is allowed
 * @event: perf event
 *
 * Read perf_event security info if allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_read(struct perf_event *event)
{
	return call_int_hook(perf_event_read, 0, event);
}

/**
 * security_perf_event_write() - Check if writing a perf event label is allowed
 * @event: perf event
 *
 * Write perf_event security info if allowed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_write(struct perf_event *event)
{
	return call_int_hook(perf_event_write, 0, event);
}
#endif /* CONFIG_PERF_EVENTS */

#ifdef CONFIG_IO_URING
/**
 * security_uring_override_creds() - Check if overriding creds is allowed
 * @new: new credentials
 *
 * Check if the current task, executing an io_uring operation, is allowed to
 * override its credentials with @new.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_override_creds(const struct cred *new)
{
	return call_int_hook(uring_override_creds, 0, new);
}

/**
 * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed
 *
 * Check whether the current task is allowed to spawn an io_uring polling
 * thread (IORING_SETUP_SQPOLL).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_sqpoll(void)
{
	return call_int_hook(uring_sqpoll, 0);
}

/**
 * security_uring_cmd() - Check if an io_uring passthrough command is allowed
 * @ioucmd: command
 *
 * Check whether the file_operations uring_cmd is allowed to run.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_cmd(struct io_uring_cmd *ioucmd)
{
	return call_int_hook(uring_cmd, 0, ioucmd);
}
#endif /* CONFIG_IO_URING */
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2019 Axis Communications AB
 *
 * Based on ttyprintk.c:
 *  Copyright (C) 2010 Samo Pogacnik
 */

#include <linux/console.h>
#include <linux/module.h>
#include <linux/tty.h>

static const struct tty_port_operations ttynull_port_ops;
static struct tty_driver *ttynull_driver;
static struct tty_port ttynull_port;

static int ttynull_open(struct tty_struct *tty, struct file *filp)
{
	return tty_port_open(&ttynull_port, tty, filp);
}

static void ttynull_close(struct tty_struct *tty, struct file *filp)
{
	tty_port_close(&ttynull_port, tty, filp);
}

static void ttynull_hangup(struct tty_struct *tty)
{
	tty_port_hangup(&ttynull_port);
}

static ssize_t ttynull_write(struct tty_struct *tty, const u8 *buf,
			     size_t count)
{
	return count;
}

static unsigned int ttynull_write_room(struct tty_struct *tty)
{
	return 65536;
}

static const struct tty_operations ttynull_ops = {
	.open = ttynull_open,
	.close = ttynull_close,
	.hangup = ttynull_hangup,
	.write = ttynull_write,
	.write_room = ttynull_write_room,
};

static struct tty_driver *ttynull_device(struct console *c, int *index)
{
	*index = 0;
	return ttynull_driver;
}

static struct console ttynull_console = {
	.name = "ttynull",
	.device = ttynull_device,
};

static int __init ttynull_init(void)
{
	struct tty_driver *driver;
	int ret;

	driver = tty_alloc_driver(1,
				  TTY_DRIVER_RESET_TERMIOS |
				  TTY_DRIVER_REAL_RAW |
				  TTY_DRIVER_UNNUMBERED_NODE);
	if (IS_ERR(driver))
		return PTR_ERR(driver);

	tty_port_init(&ttynull_port);
	ttynull_port.ops = &ttynull_port_ops;

	driver->driver_name = "ttynull";
	driver->name = "ttynull";
	driver->type = TTY_DRIVER_TYPE_CONSOLE;
	driver->init_termios = tty_std_termios;
	driver->init_termios.c_oflag = OPOST | OCRNL | ONOCR | ONLRET;
	tty_set_operations(driver, &ttynull_ops);
	tty_port_link_device(&ttynull_port, driver, 0);

	ret = tty_register_driver(driver);
	if (ret < 0) {
		tty_driver_kref_put(driver);
		tty_port_destroy(&ttynull_port);
		return ret;
	}

	ttynull_driver = driver;
	register_console(&ttynull_console);

	return 0;
}

static void __exit ttynull_exit(void)
{
	unregister_console(&ttynull_console);
	tty_unregister_driver(ttynull_driver);
	tty_driver_kref_put(ttynull_driver);
	tty_port_destroy(&ttynull_port);
}

module_init(ttynull_init);
module_exit(ttynull_exit);

MODULE_LICENSE("GPL v2");
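/*
 * Usage note (editorial): with CONFIG_NULL_TTY enabled, the driver above can
 * be selected as the boot console to silently discard all console output
 * while still giving userspace a usable /dev/console, e.g. via the kernel
 * command line:
 *
 *	console=ttynull
 */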
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arm/arm64/include/asm/current.h
 *
 * Copyright (C) 2016 ARM
 * Copyright (C) 2017 SiFive
 */

#ifndef _ASM_RISCV_CURRENT_H
#define _ASM_RISCV_CURRENT_H

#include <linux/bug.h>
#include <linux/compiler.h>

#ifndef __ASSEMBLY__

struct task_struct;

register struct task_struct *riscv_current_is_tp __asm__("tp");

/*
 * This only works because "struct thread_info" is at offset 0 from "struct
 * task_struct". This constraint seems to be necessary on other architectures
 * as well, but __switch_to enforces it. We can't check TASK_TI here because
 * <asm/asm-offsets.h> includes this, and I can't get the definition of "struct
 * task_struct" here due to some header ordering problems.
 */
static __always_inline struct task_struct *get_current(void)
{
	return riscv_current_is_tp;
}

#define current get_current()

register unsigned long current_stack_pointer __asm__("sp");

#endif /* __ASSEMBLY__ */

#endif /* _ASM_RISCV_CURRENT_H */
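/*
 * Usage note (editorial): any kernel context can reach its own task through
 * the "current" macro defined above, e.g.:
 *
 *	pr_info("running as %s/%d\n", current->comm, current->pid);
 *
 * On RISC-V this compiles down to a read of the "tp" register rather than a
 * per-CPU memory load.
 */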
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/iomap.h>
#include <linux/sched/mm.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_metadata_csum(inode->i_sb))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_metadata_csum(inode->i_sb))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

static int ext4_split_extent_at(handle_t *handle,
				struct inode *inode,
				struct ext4_ext_path **ppath,
				ext4_lblk_t split,
				int split_flag,
				int flags);

static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
	/*
	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.
	 * At this moment, get_block can be called only for blocks inside
	 * i_size since page cache has been already dropped and writes are
	 * blocked by i_rwsem. So we can safely drop the i_data_sem here.
	 */
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	ext4_discard_preallocations(inode, 0);
	up_write(&EXT4_I(inode)->i_data_sem);
	*dropped = 1;
	return 0;
}

static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth, i;

	if (!path)
		return;
	depth = path->p_depth;
	for (i = 0; i <= depth; i++, path++) {
		brelse(path->p_bh);
		path->p_bh = NULL;
	}
}

void ext4_free_ext_path(struct ext4_ext_path *path)
{
	ext4_ext_drop_refs(path);
	kfree(path);
}

/*
 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
 * transaction with 'restart_cred' credits. The function drops i_data_sem
 * when restarting transaction and gets it after transaction is restarted.
 *
 * The function returns 0 on success, 1 if transaction had to be restarted,
 * and < 0 in case of fatal error.
 */
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
				int check_cred, int restart_cred,
				int revoke_cred)
{
	int ret;
	int dropped = 0;

	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
	if (dropped)
		down_write(&EXT4_I(inode)->i_data_sem);
	return ret;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err = 0;

	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, inode->i_sb,
						    path->p_bh, EXT4_JTR_NONE);
		/*
		 * The extent buffer's verified bit will be set again in
		 * __ext4_ext_dirty(). We could leave an inconsistent
		 * buffer if the extent updating procedure breaks off due
		 * to some error, so force it to be checked again.
		 */
		if (!err)
			clear_buffer_verified(path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int __ext4_ext_dirty(const char *where, unsigned int line,
			    handle_t *handle, struct inode *inode,
			    struct ext4_ext_path *path)
{
	int err;

	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
		/* Extents updating done, re-set verified flag */
		if (!err)
			set_buffer_verified(path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

#define ext4_ext_dirty(handle, inode, path) \
		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.
		 * Maybe we should have some heuristics or some way to
		 * allow userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}

/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 5)
		size = 5;
#endif
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}

static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
			   int nofail)
{
	struct ext4_ext_path *path = *ppath;
	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
	int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;

	if (nofail)
		flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;

	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); } static int ext4_ext_max_entries(struct inode *inode, int depth) { int max; if (depth == ext_depth(inode)) { if (depth == 0) max = ext4_ext_space_root(inode, 1); else max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) max = ext4_ext_space_block(inode, 1); else max = ext4_ext_space_block_idx(inode, 1); } return max; } static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); /* * We allow neither: * - zero length * - overflow/wrap-around */ if (lblock + len <= lblock) return 0; return ext4_inode_block_valid(inode, block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { ext4_fsblk_t block = ext4_idx_pblock(ext_idx); return ext4_inode_block_valid(inode, block, 1); } static int ext4_valid_extent_entries(struct inode *inode, struct ext4_extent_header *eh, ext4_lblk_t lblk, ext4_fsblk_t *pblk, int depth) { unsigned short entries; ext4_lblk_t lblock = 0; ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; entries = le16_to_cpu(eh->eh_entries); if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); /* * The logical block in the first entry should equal to * the number in the index block. */ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext->ee_block)) return 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); /* * The logical block in the first entry should equal to * the number in the parent index block. 
*/ if (depth != ext_depth(inode) && lblk != le32_to_cpu(ext_idx->ei_block)) return 0; while (entries) { if (!ext4_valid_extent_idx(inode, ext_idx)) return 0; /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; cur = lblock + 1; } } return 1; } static int __ext4_ext_check(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_header *eh, int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk) { const char *error_msg; int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { error_msg = "unexpected eh_depth"; goto corrupted; } if (unlikely(eh->eh_max == 0)) { error_msg = "invalid eh_max"; goto corrupted; } max = ext4_ext_max_entries(inode, depth); if (unlikely(le16_to_cpu(eh->eh_max) > max)) { error_msg = "too large eh_max"; goto corrupted; } if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { error_msg = "invalid eh_entries"; goto corrupted; } if (unlikely((eh->eh_entries == 0) && (depth > 0))) { error_msg = "eh_entries is 0 but eh_depth is > 0"; goto corrupted; } if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } if (unlikely(depth > 32)) { error_msg = "too large eh_depth"; goto corrupted; } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; err = -EFSBADCRC; goto corrupted; } return 0; corrupted: ext4_error_inode_err(inode, function, line, 0, -err, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", (unsigned long long) pblk, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0) int ext4_ext_check_inode(struct inode *inode) { return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } static void ext4_cache_extents(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); ext4_lblk_t prev = 0; int i; for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { unsigned int status = EXTENT_STATUS_WRITTEN; ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); int len = ext4_ext_get_actual_len(ex); if (prev && (prev != lblk)) ext4_es_cache_extent(inode, prev, lblk - prev, ~0, EXTENT_STATUS_HOLE); if (ext4_ext_is_unwritten(ex)) status = EXTENT_STATUS_UNWRITTEN; ext4_es_cache_extent(inode, lblk, len, ext4_ext_pblock(ex), status); prev = lblk + len; } } static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, struct ext4_extent_idx *idx, int depth, int flags) { struct buffer_head *bh; int err; gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; ext4_fsblk_t pblk; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; pblk = ext4_idx_pblock(idx); bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); err = ext4_read_bh(bh, 0, NULL); if (err < 0) goto errout; } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; err = __ext4_ext_check(function, line, inode, 
ext_block_hdr(bh), depth, pblk, le32_to_cpu(idx->ei_block)); if (err) goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); ext4_cache_extents(inode, eh); } return bh; errout: put_bh(bh); return ERR_PTR(err); } #define read_extent_tree_block(inode, idx, depth, flags) \ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \ (depth), (flags)) /* * This function is called to cache a file's extent information in the * extent status tree */ int ext4_ext_precache(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_ext_path *path = NULL; struct buffer_head *bh; int i = 0, depth, ret = 0; if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ down_read(&ei->i_data_sem); depth = ext_depth(inode); /* Don't cache anything if there are no external extent blocks */ if (!depth) { up_read(&ei->i_data_sem); return ret; } path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { up_read(&ei->i_data_sem); return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) goto out; path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); while (i >= 0) { /* * If this is a leaf block or we've reached the end of * the index block, go up */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; } bh = read_extent_tree_block(inode, path[i].p_idx++, depth - i - 1, EXT4_EX_FORCE_CACHE); if (IS_ERR(bh)) { ret = PTR_ERR(bh); break; } i++; path[i].p_bh = bh; path[i].p_hdr = ext_block_hdr(bh); path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); } ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); out: up_read(&ei->i_data_sem); ext4_free_ext_path(path); return ret; } #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else ext_debug(inode, " []"); } ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) { int depth = ext_depth(inode); struct ext4_extent_header *eh; struct ext4_extent *ex; int i; if (!path) return; eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ext4_fsblk_t newblock, int level) { int depth = ext_depth(inode); struct ext4_extent *ex; if (depth != level) { struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { ext_debug(inode, "%d: move %d:%llu in new index %llu\n", level, le32_to_cpu(idx->ei_block), ext4_idx_pblock(idx), newblock); idx++; } return; } ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { ext_debug(inode, "move 
%d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), newblock); ex++; } } #else #define ext4_ext_show_path(inode, path) #define ext4_ext_show_leaf(inode, path) #define ext4_ext_show_move(inode, path, newblock, level) #endif /* * ext4_ext_binsearch_idx: * binary search for the closest index of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent_idx *r, *l, *m; ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), r, le32_to_cpu(r->ei_block)); if (block < le32_to_cpu(m->ei_block)) r = m - 1; else l = m + 1; } path->p_idx = l - 1; ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { struct ext4_extent_idx *chix, *ix; int k; chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } BUG_ON(k && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)); if (block < le32_to_cpu(ix->ei_block)) break; chix = ix; } BUG_ON(chix != path->p_idx); } #endif } /* * ext4_ext_binsearch: * binary search for closest extent of the given block * the header must be checked before calling this */ static void ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { struct ext4_extent_header *eh = path->p_hdr; struct ext4_extent *r, *l, *m; if (eh->eh_entries == 0) { /* * this leaf is empty: * we get such a leaf in split/add case */ return; } ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); while (l <= r) { m = l + (r - l) / 2; ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), r, le32_to_cpu(r->ee_block)); if (block < le32_to_cpu(m->ee_block)) r = m - 1; else l = m + 1; } path->p_ext = l - 1; ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH { struct ext4_extent *chex, *ex; int k; chex = ex = EXT_FIRST_EXTENT(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { BUG_ON(k && le32_to_cpu(ex->ee_block) <= le32_to_cpu(ex[-1].ee_block)); if (block < le32_to_cpu(ex->ee_block)) break; chex = ex; } BUG_ON(chex != path->p_ext); } #endif } void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; eh = ext_inode_hdr(inode); eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; struct ext4_ext_path *path = orig_path ? 
*orig_path : NULL; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) { EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d", depth); ret = -EFSCORRUPTED; goto err; } if (path) { ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); *orig_path = path = NULL; } } if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; i = depth; if (!(flags & EXT4_EX_NOCACHE) && depth == 0) ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags); if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } eh = ext_block_hdr(bh); ppos++; path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } path[ppos].p_depth = i; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); /* if not an empty leaf */ if (path[ppos].p_ext) path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); return path; err: ext4_free_ext_path(path); if (orig_path) *orig_path = NULL; return ERR_PTR(ret); } /* * ext4_ext_insert_index: * insert new index [@logical;@ptr] into the block at @curp; * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, struct ext4_ext_path *curp, int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; err = ext4_ext_get_access(handle, inode, curp); if (err) return err; if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) >= le16_to_cpu(curp->p_hdr->eh_max))) { EXT4_ERROR_INODE(inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ ext_debug(inode, "insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ ext_debug(inode, "insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); return -EFSCORRUPTED; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); return err; } /* * ext4_ext_split: * inserts new subtree into the path, using free index entry * at depth @at: * 
- allocates all needed blocks (new leaf and all intermediate index blocks) * - makes decision where to split * - moves remaining extents and index entries (right to the split point) * into the newly allocated blocks * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, unsigned int flags, struct ext4_ext_path *path, struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; if (flags & EXT4_EX_NOFAIL) gfp_flags |= __GFP_NOFAIL; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ /* if current leaf will be split, then we should use * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } /* * If error occurs, then we break processing * and mark filesystem read-only. index won't * be inserted and tree will be in consistent * state. Next mount will repair buffers too. */ /* * Get array to track all allocated blocks. * We need this to handle errors and free blocks * upon them. */ ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; } /* initialize new leaf */ newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = 0; neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max)) { EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; ext4_ext_show_move(inode, path, newblock, depth); if (m) { struct ext4_extent *ex; ex = EXT_FIRST_EXTENT(neh); memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = 
ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old leaf */ if (m) { err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; } /* create intermediate indexes */ k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); err = -EFSCORRUPTED; goto cleanup; } if (k) ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; while (k--) { oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; } lock_buffer(bh); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto cleanup; neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != EXT_LAST_INDEX(path[i].p_hdr))) { EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } /* zero out unused area in the extent block */ ext_size = sizeof(struct ext4_extent_header) + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); bh = NULL; /* correct old index */ if (m) { err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; } i--; } /* insert new index */ err = ext4_ext_insert_index(handle, inode, path + at, le32_to_cpu(border), newblock); cleanup: if (bh) { if (buffer_locked(bh)) unlock_buffer(bh); brelse(bh); } if (err) { /* free all allocated blocks in error case */ for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); return err; } /* * ext4_ext_grow_indepth: * implements tree growing procedure: * - allocates new block * - moves top-level data (index block or leaf) into the new block * - initializes new top-level, creating index that points to the * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; size_t ext_size = 0; /* Try to prepend new index to old 
one */
	if (ext_depth(inode))
		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
	if (goal > le32_to_cpu(es->s_first_data_block)) {
		flags |= EXT4_MB_HINT_TRY_GOAL;
		goal--;
	} else
		goal = ext4_inode_to_goal_block(inode);
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
	if (unlikely(!bh))
		return -ENOMEM;
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
					     EXT4_JTR_NONE);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	ext_size = sizeof(EXT4_I(inode)->i_data);
	/* move top-level index/leaf into new block */
	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
	/* zero out unused area in the extent block */
	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate eh_max the right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	ext4_extent_block_csum_set(inode, neh);
	set_buffer_uptodate(bh);
	set_buffer_verified(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* Update top-level index: num,max,pointer */
	neh = ext_inode_hdr(inode);
	neh->eh_entries = cpu_to_le16(1);
	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
	if (neh->eh_depth == 0) {
		/* Root extent block becomes index block */
		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
		EXT_FIRST_INDEX(neh)->ei_block =
			EXT_FIRST_EXTENT(neh)->ee_block;
	}
	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	le16_add_cpu(&neh->eh_depth, 1);
	err = ext4_mark_inode_dirty(handle, inode);
out:
	brelse(bh);

	return err;
}

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf. if no free index is found, then
 * it requests in-depth growing.
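 *
 * A rough sketch of the two cases handled here (an illustrative summary,
 * not upstream text):
 *
 *	if (EXT_HAS_FREE_INDEX(curp))		// free slot at level i
 *		ext4_ext_split(...);		// split levels below i only
 *	else
 *		ext4_ext_grow_indepth(...);	// add a new root level
 *
 * After either case the path is stale and is re-looked-up with
 * ext4_find_extent(). Only a depth 0 -> 1 grow is guaranteed to create a
 * free index slot, so deeper trees may have to repeat with a split.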
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
				    unsigned int mb_flags,
				    unsigned int gb_flags,
				    struct ext4_ext_path **ppath,
				    struct ext4_extent *newext)
{
	struct ext4_ext_path *path = *ppath;
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up to the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		path = ext4_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				   ppath, gb_flags);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
		if (err)
			goto out;

		/* refill path */
		path = ext4_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				   ppath, gb_flags);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left of *logical
 * and return it at @logical together with its physical address at @phys.
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys.
 * return value contains 0 (success) or an error code
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EFSCORRUPTED;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EFSCORRUPTED;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
				  le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block),
				  depth);
				return -EFSCORRUPTED;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EFSCORRUPTED;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * Search the closest allocated block to the right of *logical
 * and return it at @logical together with its physical address at @phys.
 * If no such block exists, return 0 and set @phys to 0. A return value
 * of 1 means we found an allocated block and *ret_ex is valid.
 * A (< 0) return value is an error code.
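 *
 * Minimal usage sketch (hypothetical caller and locals, not upstream code):
 *
 *	ext4_lblk_t lblk = start;	// 'start' is assumed context
 *	ext4_fsblk_t phys;
 *	struct ext4_extent ex;
 *	int ret = ext4_ext_search_right(inode, path, &lblk, &phys, &ex);
 *	// ret == 1: lblk/phys now describe the closest allocated block
 *	//           to the right, and 'ex' holds a copy of its extent
 *	// ret == 0: nothing allocated to the right, phys == 0
 *	// ret <  0: error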
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
				 struct ext4_extent *ret_ex)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EFSCORRUPTED;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EFSCORRUPTED;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EFSCORRUPTED;
			}
		}
		goto found_extent;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EFSCORRUPTED;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		goto found_extent;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	while (++depth < path->p_depth) {
		/* subtract from p_depth to get proper eh_depth */
		bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
		eh = ext_block_hdr(bh);
		ix = EXT_FIRST_INDEX(eh);
		put_bh(bh);
	}

	bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
	if (IS_ERR(bh))
		return PTR_ERR(bh);
	eh = ext_block_hdr(bh);
	ex = EXT_FIRST_EXTENT(eh);
found_extent:
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	if (ret_ex)
		*ret_ex = *ex;
	if (bh)
		put_bh(bh);
	return 1;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
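 *
 * For example (an illustration, not upstream text): with leaf extents
 * covering logical blocks [0..9] and [20..29], a path looked up for
 * block 5 makes this return 20, while a path for block 25 returns
 * EXT_MAX_BLOCKS since nothing is allocated further to the right.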
*/ ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) return EXT_MAX_BLOCKS; while (depth >= 0) { struct ext4_ext_path *p = &path[depth]; if (depth == path->p_depth) { /* leaf */ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; BUG_ON(path == NULL); depth = path->p_depth; /* zero-tree has no leaf blocks at all */ if (depth == 0) return EXT_MAX_BLOCKS; /* go to index block */ depth--; while (depth >= 0) { if (path[depth].p_idx != EXT_LAST_INDEX(path[depth].p_hdr)) return (ext4_lblk_t) le32_to_cpu(path[depth].p_idx[1].ei_block); depth--; } return EXT_MAX_BLOCKS; } /* * ext4_ext_correct_indexes: * if leaf gets modified and modified extent is first in the leaf, * then we have to correct all indexes above. * TODO: do we need to correct tree in all cases? */ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { struct ext4_extent_header *eh; int depth = ext_depth(inode); struct ext4_extent *ex; __le32 border; int k, err = 0; eh = path[depth].p_hdr; ex = path[depth].p_ext; if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); return -EFSCORRUPTED; } if (depth == 0) { /* there is no tree at all */ return 0; } if (ex != EXT_FIRST_EXTENT(eh)) { /* we correct tree if first leaf got modified only */ return 0; } /* * TODO: we need correction if border is smaller than current one */ k = depth - 1; border = path[depth].p_ext->ee_block; err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; while (k--) { /* change all left-side indexes */ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) break; err = ext4_ext_get_access(handle, inode, path + k); if (err) break; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) break; } return err; } static int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != le32_to_cpu(ex2->ee_block)) return 0; if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; if (ext4_ext_is_unwritten(ex1) && ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } /* * This function tries to merge the "ex" extent to the next extent in the tree. * It always tries to merge towards right. If you want to merge towards * left, pass "ex - 1" as argument instead of "ex". * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. 
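 *
 * A left merge is expressed through the same helper, e.g. (usage sketch,
 * assuming 'ex' is not the first extent in the leaf):
 *
 *	if (ex > EXT_FIRST_EXTENT(eh))
 *		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);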
*/ static int ext4_ext_try_to_merge_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; while (ex < EXT_LAST_EXTENT(eh)) { if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); if (unwritten) ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } /* * This function does a very simple check to see if we can collapse * an extent tree with a single extent tree leaf block into the inode. */ static void ext4_ext_try_to_merge_up(handle_t *handle, struct inode *inode, struct ext4_ext_path *path) { size_t s; unsigned max_root = ext4_ext_space_root(inode, 0); ext4_fsblk_t blk; if ((path[0].p_depth != 1) || (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) return; /* * We need to modify the block allocation bitmap and the block * group descriptor to release the extent tree block. If we * can't get the journal credits, give up. */ if (ext4_journal_extend(handle, 2, ext4_free_metadata_revoke_credits(inode->i_sb, 1))) return; /* * Copy the extent data up to the inode */ blk = ext4_idx_pblock(path[0].p_idx); s = le16_to_cpu(path[1].p_hdr->eh_entries) * sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); brelse(path[1].p_bh); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* * This function tries to merge the @ex extent to neighbours in the tree, then * tries to collapse the extent tree into the inode. */ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); eh = path[depth].p_hdr; if (ex > EXT_FIRST_EXTENT(eh)) merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) (void) ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); } /* * check if a portion of the "newext" extent overlaps with an * existing extent. * * If there is an overlap discovered, it updates the length of the newext * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. 
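 *
 * Worked example (illustrative, not upstream text): if newext covers
 * logical blocks [10..19] and an existing extent starts at block 15, then
 *
 *	b1 = 10, len1 = 10, b2 = 15
 *	b1 + len1 > b2  =>  newext->ee_len = b2 - b1 = 5
 *
 * so newext is trimmed to [10..14] and 1 is returned.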
*/ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, struct inode *inode, struct ext4_extent *newext, struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; unsigned int ret = 0; b1 = le32_to_cpu(newext->ee_block); len1 = ext4_ext_get_actual_len(newext); depth = ext_depth(inode); if (!path[depth].p_ext) goto out; b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } /* check for overlap */ if (b1 + len1 > b2) { newext->ee_len = cpu_to_le16(b2 - b1); ret = 1; } out: return ret; } /* * ext4_ext_insert_extent: * tries to merge requested extent into the existing extent or * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_extent *newext, int gb_flags) { struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; int mb_flags = 0, unwritten; if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* try to insert block into found extent and return */ if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. 
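	 *
	 * Illustrative example (not upstream text): with leaf extents
	 * [0..4] and [10..14] and newext [6..9], the lookup may return
	 * ex == [0..4]; since it ends before newext starts, we step to
	 * [10..14], and the prepend test below merges them into [6..14].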
*/ if (ex < EXT_LAST_EXTENT(eh) && (le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) < le32_to_cpu(newext->ee_block))) { ex += 1; goto prepend; } else if ((ex > EXT_FIRST_EXTENT(eh)) && (le32_to_cpu(newext->ee_block) + ext4_ext_get_actual_len(newext) < le32_to_cpu(ex->ee_block))) ex -= 1; /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); if (unwritten) ext4_ext_mark_unwritten(ex); nearex = ex; goto merge; } } depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) goto has_space; /* probably next leaf has space for us? */ fex = EXT_LAST_EXTENT(eh); next = EXT_MAX_BLOCKS; if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { ext_debug(inode, "next leaf block - %u\n", next); BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
*/ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); eh = path[depth].p_hdr; has_space: nearex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; if (!nearex) { /* there is no extent in this leaf, create first one */ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); nearex = EXT_FIRST_EXTENT(eh); } else { if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); nearex++; } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), nearex); } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), len, nearex, nearex + 1); memmove(nearex + 1, nearex, len * sizeof(struct ext4_extent)); } } le16_add_cpu(&eh->eh_entries, 1); path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto cleanup; err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: ext4_free_ext_path(npath); return err; } static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) { ext4_lblk_t next, end = block + num - 1; struct extent_status es; unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; unsigned int flags; int err; while (block <= end) { next = 0; flags = 0; if (!ext4_es_lookup_extent(inode, block, &next, &es)) break; if (ext4_es_is_unwritten(&es)) flags |= FIEMAP_EXTENT_UNWRITTEN; if (ext4_es_is_delayed(&es)) flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); if (ext4_es_is_hole(&es)) flags |= EXT4_FIEMAP_EXTENT_HOLE; if (next == 0) flags |= FIEMAP_EXTENT_LAST; if (flags & (FIEMAP_EXTENT_DELALLOC| EXT4_FIEMAP_EXTENT_HOLE)) es.es_pblk = 0; else es.es_pblk = ext4_es_pblock(&es); err = fiemap_fill_next_extent(fieinfo, (__u64)es.es_lblk << blksize_bits, (__u64)es.es_pblk << blksize_bits, (__u64)es.es_len << blksize_bits, flags); if (next == 0) break; block = next; if (err < 0) return err; if (err == 1) return 0; } return 0; } /* * ext4_ext_determine_hole - determine hole around given block * @inode: inode we lookup in * @path: path in extent tree to @lblk * @lblk: pointer to logical block around which we want to determine hole * * Determine hole length (and start if easily possible) around given logical * block. We don't try too hard to find the beginning of the hole but @path * actually points to extent before @lblk, we provide it. 
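 * For example (illustrative, not upstream text): with a single extent
 * covering blocks [100..109], a lookup of lblk 50 leaves *lblk at 50 and
 * reports a hole of length 100 - 50 = 50, while a lookup of lblk 120
 * moves *lblk up to 110 and reports next - 110, where 'next' comes from
 * ext4_ext_next_allocated_block().
 *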
* * The function returns the length of a hole starting at @lblk. We update @lblk * to the beginning of the hole if we managed to find it. */ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *lblk) { int depth = ext_depth(inode); struct ext4_extent *ex; ext4_lblk_t len; ex = path[depth].p_ext; if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ *lblk = 0; len = EXT_MAX_BLOCKS; } else if (*lblk < le32_to_cpu(ex->ee_block)) { len = le32_to_cpu(ex->ee_block) - *lblk; } else if (*lblk >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); next = ext4_ext_next_allocated_block(path); BUG_ON(next == *lblk); len = next - *lblk; } else { BUG(); } return len; } /* * ext4_ext_put_gap_in_cache: * calculate boundaries of the gap that the requested block fits into * and cache this gap */ static void ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, ext4_lblk_t hole_len) { struct extent_status es; ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start, hole_start + hole_len - 1, &es); if (es.es_len) { /* Is there a delayed extent containing lblock? */ if (es.es_lblk <= hole_start) return; hole_len = min(es.es_lblk - hole_start, hole_len); } ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); ext4_es_insert_extent(inode, hole_start, hole_len, ~0, EXTENT_STATUS_HOLE); } /* * ext4_ext_rm_idx: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; /* free index block */ depth--; path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) return err; if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; len *= sizeof(struct ext4_extent_idx); memmove(path->p_idx, path->p_idx + 1, len); } le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); while (--depth >= 0) { if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) break; path--; err = ext4_ext_get_access(handle, inode, path); if (err) break; path->p_idx->ei_block = (path+1)->p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path); if (err) break; } return err; } /* * ext4_ext_calc_credits_for_single_extent: * This routine returns the max. credits that are needed to insert an extent * into the extent tree. * When passing the actual path, the caller should calculate credits * under i_data_sem. */ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { int depth = ext_depth(inode); int ret = 0; /* probably there is space in leaf? */ if (le16_to_cpu(path[depth].p_hdr->eh_entries) < le16_to_cpu(path[depth].p_hdr->eh_max)) { /* * There is some space in the leaf, so no * need to account for the leaf block credit. * * Bitmaps, block group descriptor blocks, * and other metadata blocks still need to be * accounted.
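* * A worked example with made-up numbers: if * EXT4_META_TRANS_BLOCKS() evaluates to 6 for this filesystem, the * fast path below returns 2 + 6 = 8 credits.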
*/ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } } return ext4_chunk_trans_blocks(inode, nrblocks); } /* * How many index/leaf blocks need to change/allocate to add @extents extents? * * If we add a single extent, then in the worse case, each tree level * index/leaf need to be changed in case of the tree split. * * If more extents are inserted, they could cause the whole tree split more * than once, but this is really rare. */ int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; depth = ext_depth(inode); if (extents <= 1) index = depth * 2; else index = depth * 3; return index; } static inline int get_default_free_blocks_flags(struct inode *inode) { if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; else if (ext4_should_journal_data(inode)) return EXT4_FREE_BLOCKS_FORGET; return 0; } /* * ext4_rereserve_cluster - increment the reserved cluster count when * freeing a cluster with a pending reservation * * @inode - file containing the cluster * @lblk - logical block in cluster to be reserved * * Increments the reserved cluster count and adjusts quota in a bigalloc * file system when freeing a partial cluster containing at least one * delayed and unwritten block. A partial cluster meeting that * requirement will have a pending reservation. If so, the * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to * defer reserved and allocated space accounting to a subsequent call * to this function. */ static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); dquot_reclaim_block(inode, EXT4_C2B(sbi, 1)); spin_lock(&ei->i_block_reservation_lock); ei->i_reserved_data_blocks++; percpu_counter_add(&sbi->s_dirtyclusters_counter, 1); spin_unlock(&ei->i_block_reservation_lock); percpu_counter_add(&sbi->s_freeclusters_counter, 1); ext4_remove_pending(inode, lblk); } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, struct ext4_extent *ex, struct partial_cluster *partial, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t last_pblk, pblk; ext4_lblk_t num; int flags; /* only extent tail removal is allowed */ if (from < le32_to_cpu(ex->ee_block) || to != le32_to_cpu(ex->ee_block) + ee_len - 1) { ext4_error(sbi->s_sb, "strange request: removal(2) %u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } #ifdef EXTENTS_STATS spin_lock(&sbi->s_ext_stats_lock); sbi->s_ext_blocks += ee_len; sbi->s_ext_extents++; if (ee_len < sbi->s_ext_min) sbi->s_ext_min = ee_len; if (ee_len > sbi->s_ext_max) sbi->s_ext_max = ee_len; if (ext_depth(inode) > sbi->s_depth_max) sbi->s_depth_max = ext_depth(inode); spin_unlock(&sbi->s_ext_stats_lock); #endif trace_ext4_remove_blocks(inode, ex, from, to, partial); /* * if we have a partial cluster, and it's different from the * cluster of the last block in the extent, we free it */ last_pblk = ext4_ext_pblock(ex) + ee_len - 1; if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, last_pblk)) { if (partial->state == tofree) { flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, 
partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; /* * We free the partial cluster at the end of the extent (if any), * unless the cluster is used by another extent (partial_cluster * state is nofree). If a partial cluster exists here, it must be * shared with the last block in the extent. */ flags = get_default_free_blocks_flags(inode); /* partial, left end cluster aligned, right end unaligned */ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) && (EXT4_LBLK_CMASK(sbi, to) >= from) && (partial->state != nofree)) { if (ext4_is_pending(inode, to)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_PBLK_CMASK(sbi, last_pblk), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, to); partial->state = initial; flags = get_default_free_blocks_flags(inode); } flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; /* * For bigalloc file systems, we never free a partial cluster * at the beginning of the extent. Instead, we check to see if we * need to free it on a subsequent call to ext4_remove_blocks, * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; ext4_free_blocks(handle, inode, NULL, pblk, num, flags); /* reset the partial cluster if we've freed past it */ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk)) partial->state = initial; /* * If we've freed the entire extent but the beginning is not left * cluster aligned and is not marked as ineligible for freeing we * record the partial cluster at the beginning of the extent. It * wasn't freed by the preceding ext4_free_blocks() call, and we * need to look farther to the left to determine if it's to be freed * (not shared with another extent). Else, reset the partial * cluster - we're either done freeing or the beginning of the * extent is left cluster aligned. */ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) { if (partial->state == initial) { partial->pclu = EXT4_B2C(sbi, pblk); partial->lblk = from; partial->state = tofree; } } else { partial->state = initial; } return 0; } /* * ext4_ext_rm_leaf() removes the extents associated with the * blocks appearing between "start" and "end". Both "start" * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The file's inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents * have been released from it. However, if this value is * negative, it's a cluster just to the right of the * punched region and it must not be freed.
* @start: The first block to remove * @end: The last block to remove */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct partial_cluster *partial, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits, revoke_credits; struct ext4_extent_header *eh; ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; unsigned unwritten = 0; struct ext4_extent *ex; ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; if (!ex) ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_rm_leaf(inode, start, ex, partial); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { if (ext4_ext_is_unwritten(ex)) unwritten = 1; else unwritten = 0; ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; a = max(ex_ee_block, start); b = min(ex_ee_block + ex_ee_len - 1, end); ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, * so note that its first cluster is in use to avoid * freeing it when removing blocks. Eventually, the * right edge of the truncated/punched region will * be just to the left. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex); partial->pclu = EXT4_B2C(sbi, pblk); partial->state = nofree; } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); continue; } else if (b != ex_ee_block + ex_ee_len - 1) { EXT4_ERROR_INODE(inode, "can not handle truncate %u:%u " "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ num = 0; } /* * 3 for leaf, sb, and inode plus 2 (bmap and group * descriptor) for each block group; assume two block * groups plus ex_ee_len/blocks_per_block_group for * the worst case */ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); /* * We may end up freeing some index blocks and data from the * punched range. Note that partial clusters are accounted for * by ext4_free_data_revoke_credits(). 
*/ revoke_credits = ext4_free_metadata_revoke_credits(inode->i_sb, ext_depth(inode)) + ext4_free_data_revoke_credits(inode, b - a + 1); err = ext4_datasem_ensure_credits(handle, inode, credits, credits, revoke_credits); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; err = ext4_remove_blocks(handle, inode, ex, partial, a, b); if (err) goto out; if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); ex->ee_len = cpu_to_le16(num); /* * Do not mark unwritten if all the blocks in the * extent have been removed. */ if (unwritten && num) ext4_ext_mark_unwritten(ex); /* * If the extent was completely released, * we need to remove it from the leaf */ if (num == 0) { if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that * we dont have blank extents in the middle */ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * sizeof(struct ext4_extent)); /* Now get rid of the one at the end */ memset(EXT_LAST_EXTENT(eh), 0, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); } if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the * current extent. If it is shared with the current extent * we reset the partial cluster because we've reached the start of the * truncated/punched region and we're done removing blocks. 
*/ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) { pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; if (partial->pclu != EXT4_B2C(sbi, pblk)) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial->lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial->pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial->lblk); } partial->state = initial; } /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; } /* * ext4_ext_more_to_rm: * returns 1 if current index has to be freed (even partial) */ static int ext4_ext_more_to_rm(struct ext4_ext_path *path) { BUG_ON(path->p_idx == NULL); if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) return 0; /* * if truncate on deeper level happened, it wasn't partial, * so we have to consider current index for truncation */ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) return 0; return 1; } int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; partial.pclu = 0; partial.lblk = 0; partial.state = initial; ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, depth + 1, ext4_free_metadata_revoke_credits(inode->i_sb, depth)); if (IS_ERR(handle)) return PTR_ERR(handle); again: trace_ext4_ext_remove_space(inode, start, end, depth); /* * Check if we are removing extents inside the extent tree. If that * is the case, we are going to punch a hole inside the extent tree * so we have to check whether we need to split the extent covering * the last block to remove so we can easily remove the part of it * in ext4_ext_rm_leaf(). */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; ext4_lblk_t ee_block, ex_end, lblk; ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); } depth = ext_depth(inode); /* a leaf may be missing only if the inode has no blocks at all */ ex = path[depth].p_ext; if (!ex) { if (depth) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EFSCORRUPTED; } goto out; } ee_block = le32_to_cpu(ex->ee_block); ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split * the extent at 'end' block so we can easily remove the * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ if (end >= ee_block && end < ex_end) { /* * If we're going to split the extent, note that * the cluster containing the block after 'end' is * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens.
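* * For example (hypothetical blocks): punching [100, 149] out of an * extent covering [80, 199] splits it at block 150 into [80, 149] and * [150, 199]; ext4_ext_rm_leaf() can then simply remove the tail * [100, 149] of the first half.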
*/ err = ext4_force_split_extent_at(handle, inode, &path, end + 1, 1); if (err < 0) goto out; } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* * If we're punching, there's an extent to the right. * If the partial cluster hasn't been set, set it to * that extent's first cluster and its state to nofree * so it won't be freed should it contain blocks to be * removed. If it's already set (tofree/nofree), we're * retrying and keep the original partial cluster info * so a cluster marked tofree as a result of earlier * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, NULL); if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } } } /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ depth = ext_depth(inode); if (path) { int k = i = depth; while (--k > 0) path[k].p_block = le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; } path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { err = -EFSCORRUPTED; goto out; } } err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; continue; } /* this is index block */ if (!path[i].p_hdr) { ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } if (!path[i].p_idx) { /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { /* we were already here, see at next index */ path[i].p_idx--; } ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, depth - i - 1, EXT4_EX_NOCACHE); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); break; } /* Yield here to deal with large extent trees. * Should be a no-op if we did IO above. 
*/ cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; /* save actual number of indexes since this * number is changed at the next iteration */ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); i++; } else { /* we finished processing this index, go up */ if (path[i].p_hdr->eh_entries == 0 && i > 0) { /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; i--; ext_debug(inode, "return to level %d\n", i); } } trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial, path->p_hdr->eh_entries); /* * if there's a partial cluster and we have removed the first extent * in the file, then we also free the partial cluster, if any */ if (partial.state == tofree && err == 0) { int flags = get_default_free_blocks_flags(inode); if (ext4_is_pending(inode, partial.lblk)) flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER; ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, partial.pclu), sbi->s_cluster_ratio, flags); if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER) ext4_rereserve_cluster(inode, partial.lblk); partial.state = initial; } /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* * truncate to zero freed all the tree, * so we need to correct eh_depth */ err = ext4_ext_get_access(handle, inode, path); if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: ext4_free_ext_path(path); path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); return err; } /* * called at mount time */ void ext4_ext_init(struct super_block *sb) { /* * possible initialization would be here */ if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST ", aggressive tests" #endif #ifdef CHECK_BINSEARCH ", check binsearch" #endif #ifdef EXTENTS_STATS ", stats" #endif "\n"); #endif #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; EXT4_SB(sb)->s_ext_max = 0; #endif } } /* * called at umount time */ void ext4_ext_release(struct super_block *sb) { if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { struct ext4_sb_info *sbi = EXT4_SB(sb); printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", sbi->s_ext_blocks, sbi->s_ext_extents, sbi->s_ext_blocks / sbi->s_ext_extents); printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); } #endif } static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN); } /* FIXME!! 
we need to try to merge to left or right after zero-out */ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, ee_len); } /* * ext4_split_extent_at() splits an extent at given block. * * @handle: the journal handle * @inode: the file inode * @path: the path to the extent * @split: the logical block where the extent is split. * @split_flag: indicates whether the extent could be zeroed out if the split * fails, and the states (init or unwritten) of the new extents. * @flags: flags used to insert the new extent into the extent tree. * * Splits extent [a, b] into two extents [a, @split) and [@split, b], the * states of which are determined by @split_flag. * * There are two cases: * a> the extent is split into two extents. * b> no split is needed, and the extent is just marked. * * return 0 on success. */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); BUG_ON(!ext4_ext_is_unwritten(ex) && split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (split == ee_block) { /* * case b: block @split is the block that the extent begins with, * so we just change the state of the extent, and splitting * is not needed.
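* * As an illustration with made-up numbers: splitting [100, 199] at * @split == 150 yields [100, 149] and [150, 199] (case a), while * @split == 100 merely toggles the state of the whole extent (case b).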
*/ if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); goto out; } /* case a */ memcpy(&orig_ex, ex, sizeof(orig_ex)); ex->ee_len = cpu_to_le16(split - ee_block); if (split_flag & EXT4_EXT_MARK_UNWRIT1) ext4_ext_mark_unwritten(ex); /* * path may lead to new leaf, not to original leaf any more * after ext4_ext_insert_extent() returns, */ err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto fix_extent_len; ex2 = &newex; ex2->ee_block = cpu_to_le32(split); ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); ext4_ext_store_pblock(ex2, newblock); if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; zero_ex.ee_len = cpu_to_le16( ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } if (!err) { /* update the extent length and mark as initialized */ ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (!err) /* update extent status tree */ ext4_zeroout_es(inode, &zero_ex); /* If we failed at this point, we don't know in which * state the extent tree exactly is so don't try to fix * length of the original extent as it may do even more * damage. */ goto out; } } fix_extent_len: ex->ee_len = orig_ex.ee_len; /* * Ignore ext4_ext_dirty return value since we are already in error path * and err is a non-zero error code. 
*/ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; out: ext4_ext_show_leaf(inode, path); return err; } /* * ext4_split_extent() splits an extent and marks the extent which is covered * by @map as @split_flag indicates. * * It may result in splitting the extent into multiple extents (up to three). * There are three possibilities: * a> There is no split required * b> Splits in two extents: Split is happening at either end of the extent * c> Splits in three extents: Someone is splitting in the middle of the extent * */ static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; int err = 0; int unwritten; int split_flag1, flags1; int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk + map->m_len < ee_block + ee_len) { split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; if (unwritten) split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; } else { allocated = ee_len - (map->m_lblk - ee_block); } /* * An update of the path is required because the previous * ext4_split_extent_at() may result in a split of the original leaf * or an extent zeroout. */ path = ext4_find_extent(inode, map->m_lblk, ppath, flags); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; if (unwritten) { split_flag1 |= EXT4_EXT_MARK_UNWRIT1; split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; } ext4_ext_show_leaf(inode, path); out: return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() if someone tries to write * to an unwritten extent. It may result in splitting the unwritten * extent into multiple extents (up to three - one initialized and two * unwritten). * There are three possibilities: * a> There is no split required: Entire extent should be initialized * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Someone is writing in the middle of the extent * * Pre-conditions: * - The extent pointed to by 'path' is unwritten. * - The extent pointed to by 'path' contains a superset * of the logical span [map->m_lblk, map->m_lblk + map->m_len). * * Post-conditions on success: * - the returned value is the number of blocks beyond map->m_lblk * that are allocated and initialized. * It is guaranteed to be >= map->m_len.
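* * Example (hypothetical numbers): for an unwritten extent covering * [100, 163] and a write of [120, 135], up to 64 - (120 - 100) == 44 * blocks past block 120 may be reported as initialized, depending on * how the extent is split or zeroed out.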
*/ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; struct ext4_extent zero_ex1, zero_ex2; struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map_len) eof_block = map->m_lblk + map_len; depth = ext_depth(inode); eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); zero_ex1.ee_len = 0; zero_ex2.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); /* Pre-conditions */ BUG_ON(!ext4_ext_is_unwritten(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); /* * Attempt to transfer newly initialized blocks from the currently * unwritten extent to its neighbor. This is much cheaper * than an insertion followed by a merge as those involve costly * memmove() calls. Transferring to the left is the common case in * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) * followed by append writes. * * Limitations of the current logic: * - L1: we do not deal with writes covering the whole extent. * This would require removing the extent if the transfer * is possible. * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ ext4_lblk_t prev_lblk; ext4_fsblk_t prev_pblk, ee_pblk; unsigned int prev_len; abut_ex = ex - 1; prev_lblk = le32_to_cpu(abut_ex->ee_block); prev_len = ext4_ext_get_actual_len(abut_ex); prev_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. 
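* * A sketch with made-up numbers: if abut_ex is an initialized extent * [0, 9] at pblk 1000 and ex is an unwritten extent [10, 29] at pblk * 1010, a write of map_len == 5 at block 10 shifts ex to [15, 29] at * pblk 1015 and grows abut_ex to [0, 14], with no insertion or * memmove needed.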
*/ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((prev_lblk + prev_len) == ee_block) && /*C2*/ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of ex by 'map_len' blocks */ ex->ee_block = cpu_to_le32(ee_block + map_len); ext4_ext_store_pblock(ex, ee_pblk + map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ ex < EXT_LAST_EXTENT(eh)) { /*L2*/ /* See if we can merge right */ ext4_lblk_t next_lblk; ext4_fsblk_t next_pblk, ee_pblk; unsigned int next_len; abut_ex = ex + 1; next_lblk = le32_to_cpu(abut_ex->ee_block); next_len = ext4_ext_get_actual_len(abut_ex); next_pblk = ext4_ext_pblock(abut_ex); ee_pblk = ext4_ext_pblock(ex); /* * A transfer of blocks from 'ex' to 'abut_ex' is allowed * upon those conditions: * - C1: abut_ex is initialized, * - C2: abut_ex is logically abutting ex, * - C3: abut_ex is physically abutting ex, * - C4: abut_ex can receive the additional blocks without * overflowing the (initialized) length limit. */ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ ((map->m_lblk + map_len) == next_lblk) && /*C2*/ ((ee_pblk + ee_len) == next_pblk) && /*C3*/ (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); /* Shift the start of abut_ex by 'map_len' blocks */ abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); ext4_ext_store_pblock(abut_ex, next_pblk - map_len); ex->ee_len = cpu_to_le16(ee_len - map_len); ext4_ext_mark_unwritten(ex); /* Restore the flag */ /* Extend abut_ex by 'map_len' blocks */ abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ allocated = map_len; } } if (allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; goto out; } else allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); /* * five cases: * 1. split the extent into three extents. * 2. split the extent into two extents, zeroout the head of the first * extent. * 3. split the extent into two extents, zeroout the tail of the second * extent. * 4. split the extent into two extents without zeroout. * 5. no splitting needed, just possibly zeroout the head and/or the * tail of the extent.
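* * A hypothetical run of this logic: with max_zeroout == 32 blocks, a * write of 8 blocks at the start of a 20-block unwritten extent * zeroes out the trailing 12 blocks, and the whole extent is then * simply marked initialized (case 5), avoiding a split.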
*/ split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; if (max_zeroout && (allocated > split_map.m_len)) { if (allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = cpu_to_le16(allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { /* case 2 or 5 */ if (split_map.m_lblk != ee_block) { zero_ex2.ee_block = ex->ee_block; zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - ee_block); ext4_ext_store_pblock(&zero_ex2, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; allocated = map->m_len; } } fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) err = 0; out: /* If we have gotten a failure, don't zero out status tree */ if (!err) { ext4_zeroout_es(inode, &zero_ex1); ext4_zeroout_es(inode, &zero_ex2); } return err ? err : allocated; } /* * This function is called by ext4_ext_map_blocks() from * ext4_get_blocks_dio_write() when DIO is used to write * to an unwritten extent. * * Writing to an unwritten extent may result in splitting the unwritten * extent into multiple initialized/unwritten extents (up to three). * There are three possibilities: * a> There is no split required: Entire extent should be unwritten * b> Splits in two extents: Write is happening at either end of the extent * c> Splits in three extents: Someone is writing in the middle of the extent * * This works the same way in the case of initialized -> unwritten conversion. * * One or more index blocks may be needed if the extent tree grows after * the unwritten extent split. To prevent ENOSPC from occurring at IO * completion, we need to split the unwritten extent before the IO is * submitted. The unwritten extent in question will be split into * (at most) three unwritten extents. After the IO completes, the part * being filled will be converted to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * * Returns the size of the unwritten extent to be written on success. */ static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags) { struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (eof_block < map->m_lblk + map->m_len) eof_block = map->m_lblk + map->m_len; /* * It is safe to convert extent to initialized via explicit * zeroout only if extent is fully inside i_size or new_size. */ depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); /* Convert to unwritten */ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { split_flag |= EXT4_EXT_DATA_VALID1; /* Convert to initialized */ } else if (flags & EXT4_GET_BLOCKS_CONVERT) { split_flag |= ee_block + ee_len <= eof_block ?
EXT4_EXT_MAY_ZEROOUT : 0; split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still * have some extent state machine issues left. So extent_split is still * required. * TODO: Once all related issues will be fixed this situation should be * illegal. */ if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef CONFIG_EXT4_DEBUG ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu," " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); out: ext4_ext_show_leaf(inode, path); return err; } static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, unsigned int *allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; int depth; int err = 0; /* * Make sure that the extent is no bigger than we support with * unwritten extent */ if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; depth = ext_depth(inode); ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); return -EFSCORRUPTED; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) return err; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ ext4_ext_try_to_merge(handle, inode, path, ex); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) return err; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); map->m_flags |= 
EXT4_MAP_UNWRITTEN; if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; return 0; } static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, allocated); ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to * allocate metadata blocks for the new extent block if needed. */ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); if (ret < 0) { err = ret; goto out2; } /* * shouldn't get a 0 return when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ if (unlikely(ret == 0)) { EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { err = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (err < 0) goto out2; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { map->m_flags |= EXT4_MAP_UNWRITTEN; goto map_out; } /* buffered READ or buffered write_begin() lookup */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * We have blocks reserved already. We * return allocated blocks so that delalloc * won't do block reservation for us. But * the buffer head will be unmapped so that * a read from the block returns 0s. */ map->m_flags |= EXT4_MAP_UNWRITTEN; goto out1; } /* * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret < 0) { err = ret; goto out2; } ext4_update_inode_fsync_trans(handle, inode, 1); /* * shouldn't get a 0 return when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ if (unlikely(ret == 0)) { EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } out: allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); out2: return err ? err : allocated; } /* * get_implied_cluster_alloc - check to see if the requested * allocation (in the map structure) overlaps with a cluster already * allocated in an extent. * @sb The filesystem superblock structure * @map The requested lblk->pblk mapping * @ex The extent structure which might contain an implied * cluster allocation * * This function is called by ext4_ext_map_blocks() after we failed to * find blocks that were already in the inode's extent tree. 
Hence, * we know that the beginning of the requested region cannot overlap * the extent from the inode's extent tree. There are three cases we * want to catch. The first is this case: * * |--- cluster # N--| * |--- extent ---| |---- requested region ---| * |==========| * * The second case that we need to test for is this one: * * |--------- cluster # N ----------------| * |--- requested region --| |------- extent ----| * |=======================| * * The third case is when the requested region lies between two extents * within the same cluster: * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| * * In each of the above cases, we need to set map->m_pblk and * map->m_len so they correspond to the extent labelled as * "|====|" from cluster #N, since it is already in use for data in * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to * signal to ext4_ext_map_blocks() that map->m_pblk should be treated * as a new "allocated" block region. Otherwise, we will return 0 and * ext4_ext_map_blocks() will then allocate one or more new clusters * by calling ext4_mb_new_blocks(). */ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_map_blocks *map, struct ext4_extent *ex, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len = ext4_ext_get_actual_len(ex); /* The extent passed in that we are trying to match */ ex_cluster_start = EXT4_B2C(sbi, ee_block); ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); /* The requested region passed into ext4_map_blocks() */ rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); if ((rr_cluster_start == ex_cluster_end) || (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* * Check for and handle this case: * * |--------- cluster # N-------------| * |------- extent ----| * |--- requested region ---| * |===========| */ if (map->m_lblk < ee_block) map->m_len = min(map->m_len, ee_block - map->m_lblk); /* * Check for the case where there is already another allocated * block to the right of 'ex' but before the end of the cluster. * * |------------- cluster # N-------------| * |----- ex -----| |---- ex_right ----| * |------ requested region ------| * |================| */ if (map->m_lblk > ee_block) { ext4_lblk_t next = ext4_ext_next_allocated_block(path); map->m_len = min(map->m_len, next - map->m_lblk); } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); return 1; } trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); return 0; } /* * Block allocation/map/preallocation routine for extent-based files * * * Need to be called with * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) * * return > 0, number of blocks already mapped/allocated * if create == 0 and these are pre-allocated blocks * buffer head is unmapped * otherwise blocks are mapped * * return = 0, if plain look up failed (blocks have not been allocated) * buffer head is unmapped * * return < 0, error case.
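* * A sketch of the lookup case (create == 0, hypothetical numbers): * with an extent mapping blocks [100, 163] and a request for * map->m_lblk == 120, map->m_len == 16, the function returns 16 and * sets map->m_pblk so that blocks 120..135 resolve within the * existing extent.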
*/ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; goto out; } depth = ext_depth(inode); /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; /* * unwritten extents are treated as holes, except that * we split out initialized portions during a write. */ ee_len = ext4_ext_get_actual_len(ex); trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); /* if found extent covers block, simply return it */ if (in_range(map->m_lblk, ee_block, ee_len)) { newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); ext_debug(inode, "%u fit into %u:%d -> %llu\n", map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the * caller wants to convert it to unwritten. */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { err = convert_initialized_extent(handle, inode, map, &path, &allocated); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; ext4_ext_show_leaf(inode, path); goto out; } ret = ext4_ext_handle_unwritten_extents( handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; else allocated = ret; goto out; } } /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ext4_lblk_t hole_start, hole_len; hole_start = map->m_lblk; hole_len = ext4_ext_determine_hole(inode, path, &hole_start); /* * put just found gap into cache to speed up * subsequent requests */ ext4_ext_put_gap_in_cache(inode, hole_start, hole_len); /* Update hole_len to reflect hole size after map->m_lblk */ if (hole_start != map->m_lblk) hole_len -= map->m_lblk - hole_start; map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, hole_len); goto out; } /* * Okay, we need to do block allocation. */ newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned * by ext4_find_extent() implies a cluster we can use. 
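* * Example (hypothetical bigalloc geometry): with a cluster ratio of * 16, map->m_lblk == 35 sits at offset 3 within cluster 2; if that * cluster was already allocated for a neighboring extent, the request * can be satisfied from it without allocating a new cluster.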
*/ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* find neighbour allocated blocks */ ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out; ar.lright = map->m_lblk; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ if ((sbi->s_cluster_ratio > 1) && err && get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; } /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is * EXT_INIT_MAX_LEN and for an unwritten extent this limit is * EXT_UNWRITTEN_MAX_LEN. */ if (map->m_len > EXT_INIT_MAX_LEN && !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_INIT_MAX_LEN; else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) map->m_len = EXT_UNWRITTEN_MAX_LEN; /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ newex.ee_len = cpu_to_le16(map->m_len); err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else allocated = map->m_len; /* allocate new block */ ar.inode = inode; ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); ar.logical = map->m_lblk; /* * We calculate the offset from the beginning of the cluster * for the logical block number, since when we allocate a * physical cluster, the physical block should start at the * same offset from the beginning of the cluster. This is * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; got_allocated_blocks: /* try to insert new extent into found leaf and return */ pblk = newblock + offset; ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (err) { if (allocated_clusters) { int fb_flags = 0; /* * free data blocks we just allocated. * not a good idea to call discard here directly, * but otherwise we'd need to call it every free(). 
*/ ext4_discard_preallocations(inode, 0); if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; ext4_free_blocks(handle, inode, NULL, newblock, EXT4_C2B(sbi, allocated_clusters), fb_flags); } goto out; } /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. */ if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* * When allocating delayed allocated clusters, simply * reduce the reserved cluster count and claim quota */ ext4_da_update_reserve_space(inode, allocated_clusters, 1); } else { ext4_lblk_t lblk, len; unsigned int n; /* * When allocating non-delayed allocated clusters * (from fallocate, filemap, DIO, or clusters * allocated when delalloc has been disabled by * ext4_nonda_switch), reduce the reserved cluster * count by the number of allocated clusters that * have previously been delayed allocated. Quota * has been claimed by ext4_mb_new_blocks() above, * so release the quota reservations made for any * previously delayed allocated clusters. */ lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); len = allocated_clusters << sbi->s_cluster_bits; n = ext4_es_delayed_clu(inode, lblk, len); if (n > 0) ext4_da_update_reserve_space(inode, (int) n, 0); } } /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. */ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); map->m_pblk = pblk; map->m_len = ar.len; allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); return err ? err : allocated; } int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; int err = 0; /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. */ /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_mark_inode_dirty(handle, inode); if (err) return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, ext4_lblk_t len, loff_t new_size, int flags) { struct inode *inode = file_inode(file); handle_t *handle; int ret = 0, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple * extents. 
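 *
 * (EXT_UNWRITTEN_MAX_LEN is one block short of EXT_INIT_MAX_LEN's
 * 32768 blocks because the high bit of ee_len marks an extent as
 * unwritten, so a single extent can map at most 128 MiB of 4k blocks.)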
*/ if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); retry: while (len) { /* * Recalculate credits when extent tree depth changes. */ if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { ext4_debug("inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); break; } /* * allow a full retry cycle for any remaining allocations */ retries = 0; map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); ret3 = ext4_journal_stop(handle); ret2 = ret3 ? ret3 : ret2; if (unlikely(ret2)) break; } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret > 0 ? ret2 : ret; } static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); static int ext4_insert_range(struct file *file, loff_t offset, loff_t len); static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; int ret = 0; int flags; int credits; int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); /* * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the * range. Here, start and partial_begin are inclusive, end and * partial_end are exclusive. 
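 *
 * Worked example (a sketch, assuming a 4k block size): offset = 1000,
 * len = 10000 gives start = 4096 and end = 8192, hence lblk = 1 and
 * max_blocks = 1; partial_begin = 1000 and partial_end = 2808, so both
 * edges are zeroed by hand while the single interior block is
 * converted to an unwritten extent.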
*/ start = round_up(offset, 1 << blkbits); end = round_down((offset + len), 1 << blkbits); if (start < offset || end > offset + len) return -EINVAL; partial_begin = offset & ((1 << blkbits) - 1); partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); if (max_blocks < lblk) max_blocks = 0; else max_blocks -= lblk; inode_lock(inode); /* * Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out_mutex; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out_mutex; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, round_down(offset, 1 << blkbits) >> blkbits, (round_up((offset + len), 1 << blkbits) - round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags); if (ret) goto out_mutex; } /* Zero range excluding the unaligned edges */ if (max_blocks > 0) { flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); /* * Prevent page faults from reinstantiating pages we have * released from page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } /* * For journalled data we need to write (and checkpoint) pages * before discarding page cache to avoid inconsitent data on * disk in case of crash before zeroing trans is committed. */ if (ext4_should_journal_data(inode)) { ret = filemap_write_and_wait_range(mapping, start, end - 1); if (ret) { filemap_invalidate_unlock(mapping); goto out_mutex; } } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } if (!partial_begin && !partial_end) goto out_mutex; /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode */ credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; if (ext4_should_journal_data(inode)) credits += 2; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); goto out_mutex; } inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); out_mutex: inode_unlock(inode); return ret; } /* * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. 
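 * A typical call from userspace (illustrative values) is
 * fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20), which preallocates
 * 16 MiB of unwritten extents without changing the file size.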
* For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); loff_t new_size = 0; unsigned int max_blocks; int ret = 0; int flags; ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; /* * Encrypted inodes can't handle collapse range or insert * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). */ if (IS_ENCRYPTED(inode) && (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_convert_inline_data(inode); inode_unlock(inode); if (ret) goto exit; if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(file, offset, len); goto exit; } if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = ext4_collapse_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_INSERT_RANGE) { ret = ext4_insert_range(file, offset, len); goto exit; } if (mode & FALLOC_FL_ZERO_RANGE) { ret = ext4_zero_range(file, offset, len, mode); goto exit; } trace_ext4_fallocate_enter(inode, offset, len, mode); lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; inode_lock(inode); /* * We only support preallocation for extent-based files only */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE) && (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) goto out; } /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out; ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); if (ret) goto out; if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } out: inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); exit: return ret; } /* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to * written extents. * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. * Returns 0 on success. 
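 *
 * Worked example for the range computation below (a sketch, assuming
 * 4k blocks): offset = 5000, len = 10000 gives map.m_lblk = 1 and
 * max_blocks = EXT4_MAX_BLOCKS(10000, 5000, 12), which resolves to 3;
 * blocks 1..3 fully cover bytes [5000, 15000).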
*/ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); if (!handle) { /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, max_blocks); } while (ret >= 0 && ret < max_blocks) { map.m_lblk += ret; map.m_len = (max_blocks -= ret); if (credits) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); break; } } ret = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_IO_CONVERT_EXT); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); ret2 = ext4_mark_inode_dirty(handle, inode); if (credits) { ret3 = ext4_journal_stop(handle); if (unlikely(ret3)) ret2 = ret3; } if (ret <= 0 || ret2) break; } return ret > 0 ? ret2 : ret; } int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) { int ret = 0, err = 0; struct ext4_io_end_vec *io_end_vec; /* * This is somewhat ugly but the idea is clear: When transaction is * reserved, everything goes into it. Otherwise we rather start several * smaller transactions for conversion of each extent separately. */ if (handle) { handle = ext4_journal_start_reserved(handle, EXT4_HT_EXT_CONVERT); if (IS_ERR(handle)) return PTR_ERR(handle); } list_for_each_entry(io_end_vec, &io_end->list_vec, list) { ret = ext4_convert_unwritten_extents(handle, io_end->inode, io_end_vec->offset, io_end_vec->size); if (ret) break; } if (handle) err = ext4_journal_stop(handle); return ret < 0 ? ret : err; } static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; u16 iomap_type; /* in-inode? 
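 * In-inode xattrs live in the gap between the legacy 128-byte inode
 * and s_inode_size; e.g. with 256-byte inodes and i_extra_isize == 32
 * (illustrative numbers), the range computed below starts 160 bytes
 * into the inode table block and runs for 96 bytes.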
*/ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct ext4_iloc iloc; int offset; /* offset of xattr in inode */ error = ext4_get_inode_loc(inode, &iloc); if (error) return error; physical = (__u64)iloc.bh->b_blocknr << blockbits; offset = EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; brelse(iloc.bh); iomap_type = IOMAP_INLINE; } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; iomap_type = IOMAP_MAPPED; } else { /* no in-inode or external block for xattr, so return -ENOENT */ error = -ENOENT; goto out; } iomap->addr = physical; iomap->offset = 0; iomap->length = length; iomap->type = iomap_type; iomap->flags = 0; out: return error; } static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int error; error = ext4_iomap_xattr_fiemap(inode, iomap); if (error == 0 && (offset >= iomap->length)) error = -ENOENT; return error; } static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { u64 maxbytes; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) maxbytes = inode->i_sb->s_maxbytes; else maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; if (*len == 0) return -EINVAL; if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; return 0; } int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the * generic check. */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_xattr_ops); } return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk, len_blks; __u64 last_blk; int error = 0; if (ext4_has_inline_data(inode)) { int has_inline; down_read(&EXT4_I(inode)->xattr_sem); has_inline = ext4_has_inline_data(inode); up_read(&EXT4_I(inode)->xattr_sem); if (has_inline) return 0; } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } error = fiemap_prep(inode, fieinfo, start, &len, 0); if (error) return error; error = ext4_fiemap_check_ranges(inode, start, &len); if (error) return error; start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; if (last_blk >= EXT_MAX_BLOCKS) last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* * Walk the extent tree gathering extent information * and pushing extents back to the user. 
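 *
 * (For the conversion above, a sketch assuming 4k blocks: start = 1 MiB
 * and len = 2 MiB give start_blk = 256, last_blk = 767, len_blks = 512.)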
*/ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); } /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; bool update = false; int credits, restart_credits; depth = path->p_depth; while (depth >= 0) { if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); /* leaf + sb + inode */ credits = 3; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) { update = true; /* extent tree + sb + inode */ credits = depth + 2; } restart_credits = ext4_writepage_trans_blocks(inode); err = ext4_datasem_ensure_credits(handle, inode, credits, restart_credits, 0); if (err) { if (err > 0) err = -EAGAIN; goto out; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { le32_add_cpu(&ex_start->ee_block, -shift); /* Try to merge to the left. */ if ((ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) && ext4_ext_try_to_merge_right(inode, path, ex_start - 1)) ex_last--; else ex_start++; } else { le32_add_cpu(&ex_last->ee_block, shift); ext4_ext_try_to_merge_right(inode, path, ex_last); ex_last--; } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; if (--depth < 0 || !update) break; } /* Update index too */ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; if (SHIFT == SHIFT_LEFT) le32_add_cpu(&path[depth].p_idx->ei_block, -shift); else le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; /* we are done if current index is not a starting index */ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; depth--; } out: return err; } /* * ext4_ext_shift_extents: * All the extents which lies in the range from @start to the last allocated * block for the @inode are shifted either towards left or right (depending * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t start, ext4_lblk_t shift, enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; ext4_lblk_t stop, *iterator, ex_start, ex_end; ext4_lblk_t tmp = EXT_MAX_BLOCKS; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) goto out; stop = le32_to_cpu(extent->ee_block); /* * For left shifts, make sure the hole on the left is big enough to * accommodate the shift. For right shifts, make sure the last extent * won't be shifted beyond EXT_MAX_BLOCKS. 
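 *
 * E.g. if the extent to the left of @start ends at block 100 and
 * @start is 150, the hole can absorb a left shift of at most 50
 * blocks; anything larger fails with -EINVAL below.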
*/ if (SHIFT == SHIFT_LEFT) { path = ext4_find_extent(inode, start - 1, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (extent) { ex_start = le32_to_cpu(extent->ee_block); ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { ex_start = 0; ex_end = 0; } if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) { ret = -EINVAL; goto out; } } else { if (shift > EXT_MAX_BLOCKS - (stop + ext4_ext_get_actual_len(extent))) { ret = -EINVAL; goto out; } } /* * In case of left shift, iterator points to start and it is increased * till we reach stop. In case of right shift, iterator points to stop * and it is decreased till we reach start. */ again: ret = 0; if (SHIFT == SHIFT_LEFT) iterator = &start; else iterator = &stop; if (tmp != EXT_MAX_BLOCKS) *iterator = tmp; /* * Its safe to start updating extents. Start and stop are unsigned, so * in case of right shift if extent with 0 block is reached, iterator * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { path = ext4_find_extent(inode, *iterator, &path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { *iterator = ext4_ext_next_allocated_block(path); continue; } } tmp = *iterator; if (SHIFT == SHIFT_LEFT) { extent = EXT_LAST_EXTENT(path[depth].p_hdr); *iterator = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); } else { extent = EXT_FIRST_EXTENT(path[depth].p_hdr); if (le32_to_cpu(extent->ee_block) > start) *iterator = le32_to_cpu(extent->ee_block) - 1; else if (le32_to_cpu(extent->ee_block) == start) iterator = NULL; else { extent = EXT_LAST_EXTENT(path[depth].p_hdr); while (le32_to_cpu(extent->ee_block) >= start) extent--; if (extent == EXT_LAST_EXTENT(path[depth].p_hdr)) break; extent++; iterator = NULL; } path[depth].p_ext = extent; } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, SHIFT); /* iterator can be NULL which means we should break */ if (ret == -EAGAIN) goto again; if (ret) break; } out: ext4_free_ext_path(path); return ret; } /* * ext4_collapse_range: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; loff_t new_size, ioffset; int ret; /* * We need to test this early because xfstests assumes that a * collapse range of (0, 1) will return EOPNOTSUPP if the file * system does not support collapse range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Collapse range works only on fs cluster size aligned regions. 
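 * OR-ing offset and len tests both alignments at once: a low-order bit
 * set in either operand survives the OR, so e.g. offset = 8192 with
 * len = 6144 fails a 4096-byte cluster check because len is misaligned.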
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Wait for existing dio to complete */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_mmap; /* * Need to round down offset to be aligned with page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* * Write tail of the last page before removed range since it will get * removed from the page cache below. */ ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty * by i_rwsem and invalidate_lock. */ ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } ext4_discard_preallocations(inode, 0); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_mmap: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } /* * ext4_insert_range: * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. * The data blocks starting from @offset to the EOF are shifted by @len * towards right to create a hole in the @inode. Inode size is increased * by len bytes. * Returns 0 on success, error otherwise. 
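 *
 * E.g. inserting 1 MiB at offset 4 MiB of a 16 MiB file shifts every
 * block from 4 MiB onward right by 1 MiB, grows i_size to 17 MiB and
 * leaves an unallocated hole over [4 MiB, 5 MiB).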
*/ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; int ret = 0, depth, split_flag = 0; loff_t ioffset; /* * We need to test this early because xfstests assumes that an * insert range of (0, 1) will return EOPNOTSUPP if the file * system does not support insert range. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; /* Insert range works only on fs cluster size aligned regions. */ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; goto out_mutex; } /* Check whether the maximum file size would be exceeded */ if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } /* Offset must be less than i_size */ if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } /* Wait for existing dio to complete */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_mmap; /* * Need to round down to align start offset to page size boundary * for page size > block size. */ ioffset = round_down(offset, PAGE_SIZE); /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, LLONG_MAX); if (ret) goto out_mmap; truncate_pagecache(inode, ioffset); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_mmap; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } depth = ext_depth(inode); extent = path[depth].p_ext; if (extent) { ee_start_lblk = le32_to_cpu(extent->ee_block); ee_len = ext4_ext_get_actual_len(extent); /* * If offset_lblk is not the starting block of extent, split * the extent @offset_lblk */ if ((offset_lblk > ee_start_lblk) && (offset_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; ret = ext4_split_extent_at(handle, inode, &path, offset_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); } ext4_free_ext_path(path); if (ret < 0) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } } else { ext4_free_ext_path(path); } ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); /* * if offset_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = 
ext4_ext_shift_extents(inode, handle,
		max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
	up_write(&EXT4_I(inode)->i_data_sem);
	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);

out_stop:
	ext4_journal_stop(handle);
out_mmap:
	filemap_invalidate_unlock(mapping);
out_mutex:
	inode_unlock(inode);
	return ret;
}

/**
 * ext4_swap_extents() - Swap extents between two inodes
 * @handle: handle for this transaction
 * @inode1: First inode
 * @inode2: Second inode
 * @lblk1: Start block for first inode
 * @lblk2: Start block for second inode
 * @count: Number of blocks to swap
 * @unwritten: Mark second inode's extents as unwritten after swap
 * @erp: Pointer to save error value
 *
 * This helper routine does exactly what its name promises: swap extents.
 * All other stuff such as page-cache locking consistency, bh mapping
 * consistency or extent's data copying must be performed by caller.
 * Locking:
 *	i_rwsem is held for both inodes
 *	i_data_sem is locked for write for both inodes
 * Assumptions:
 *	All pages from requested range are locked for both inodes
 */
int
ext4_swap_extents(handle_t *handle, struct inode *inode1,
		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
		  ext4_lblk_t count, int unwritten, int *erp)
{
	struct ext4_ext_path *path1 = NULL;
	struct ext4_ext_path *path2 = NULL;
	int replaced_count = 0;

	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
	BUG_ON(!inode_is_locked(inode1));
	BUG_ON(!inode_is_locked(inode2));

	ext4_es_remove_extent(inode1, lblk1, count);
	ext4_es_remove_extent(inode2, lblk2, count);

	while (count) {
		struct ext4_extent *ex1, *ex2, tmp_ex;
		ext4_lblk_t e1_blk, e2_blk;
		int e1_len, e2_len, len;
		int split = 0;

		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
		if (IS_ERR(path1)) {
			*erp = PTR_ERR(path1);
			path1 = NULL;
		finish:
			count = 0;
			goto repeat;
		}
		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
		if (IS_ERR(path2)) {
			*erp = PTR_ERR(path2);
			path2 = NULL;
			goto finish;
		}
		ex1 = path1[path1->p_depth].p_ext;
		ex2 = path2[path2->p_depth].p_ext;
		/* Do we have something to swap ? */
		if (unlikely(!ex2 || !ex1))
			goto finish;

		e1_blk = le32_to_cpu(ex1->ee_block);
		e2_blk = le32_to_cpu(ex2->ee_block);
		e1_len = ext4_ext_get_actual_len(ex1);
		e2_len = ext4_ext_get_actual_len(ex2);

		/* Hole handling */
		if (!in_range(lblk1, e1_blk, e1_len) ||
		    !in_range(lblk2, e2_blk, e2_len)) {
			ext4_lblk_t next1, next2;

			/* if hole after extent, then go to next extent */
			next1 = ext4_ext_next_allocated_block(path1);
			next2 = ext4_ext_next_allocated_block(path2);
			/* If hole before extent, then shift to that extent */
			if (e1_blk > lblk1)
				next1 = e1_blk;
			if (e2_blk > lblk2)
				next2 = e2_blk;
			/* Do we have something to swap */
			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
				goto finish;
			/* Move to the rightmost boundary */
			len = next1 - lblk1;
			if (len < next2 - lblk2)
				len = next2 - lblk2;
			if (len > count)
				len = count;
			lblk1 += len;
			lblk2 += len;
			count -= len;
			goto repeat;
		}

		/* Prepare left boundary */
		if (e1_blk < lblk1) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode1,
						&path1, lblk1, 0);
			if (unlikely(*erp))
				goto finish;
		}
		if (e2_blk < lblk2) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode2,
						&path2, lblk2, 0);
			if (unlikely(*erp))
				goto finish;
		}
		/* ext4_split_extent_at() may result in leaf extent split,
		 * path must be revalidated. */
		if (split)
			goto repeat;

		/* Prepare right boundary */
		len = count;
		if (len > e1_blk + e1_len - lblk1)
			len = e1_blk + e1_len - lblk1;
		if (len > e2_blk + e2_len - lblk2)
			len = e2_blk + e2_len - lblk2;

		if (len != e1_len) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode1,
						&path1, lblk1 + len, 0);
			if (unlikely(*erp))
				goto finish;
		}
		if (len != e2_len) {
			split = 1;
			*erp = ext4_force_split_extent_at(handle, inode2,
						&path2, lblk2 + len, 0);
			if (*erp)
				goto finish;
		}
		/* ext4_split_extent_at() may result in leaf extent split,
		 * path must be revalidated. */
		if (split)
			goto repeat;

		BUG_ON(e2_len != e1_len);
		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
		if (unlikely(*erp))
			goto finish;
		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
		if (unlikely(*erp))
			goto finish;

		/* Both extents are fully inside boundaries. Swap it now */
		tmp_ex = *ex1;
		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
		ex1->ee_len = cpu_to_le16(e2_len);
		ex2->ee_len = cpu_to_le16(e1_len);
		if (unwritten)
			ext4_ext_mark_unwritten(ex2);
		if (ext4_ext_is_unwritten(&tmp_ex))
			ext4_ext_mark_unwritten(ex1);

		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
		*erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth);
		if (unlikely(*erp))
			goto finish;
		*erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth);
		/*
		 * Looks scary, but the second inode already points to the
		 * new blocks and was successfully dirtied. Luckily an error
		 * here can only be a journal error, in which case the whole
		 * transaction will be aborted anyway.
		 */
		if (unlikely(*erp))
			goto finish;
		lblk1 += len;
		lblk2 += len;
		replaced_count += len;
		count -= len;

	repeat:
		ext4_free_ext_path(path1);
		ext4_free_ext_path(path2);
		path1 = path2 = NULL;
	}
	return replaced_count;
}

/*
 * ext4_clu_mapped - determine whether any block in a logical cluster has
 *                   been mapped to a physical cluster
 *
 * @inode - file containing the logical cluster
 * @lclu - logical cluster of interest
 *
 * Returns 1 if any block in the logical cluster is mapped, signifying
 * that a physical cluster has been allocated for it. Otherwise,
 * returns 0. Can also return negative error codes. Derived from
 * ext4_ext_map_blocks().
 */
int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_ext_path *path;
	int depth, mapped = 0, err = 0;
	struct ext4_extent *extent;
	ext4_lblk_t first_lblk, first_lclu, last_lclu;

	/*
	 * if data can be stored inline, the logical cluster isn't
	 * mapped - no physical clusters have been allocated, and the
	 * file has no extents
	 */
	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
	    ext4_has_inline_data(inode))
		return 0;

	/* search for the extent closest to the first block in the cluster */
	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
		path = NULL;
		goto out;
	}

	depth = ext_depth(inode);

	/*
	 * A consistent leaf must not be empty. This situation is possible,
	 * though, _during_ tree modification, and it's why an assert can't
	 * be put in ext4_find_extent().
*/ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address - lblock: %lu, depth: %d, pblock: %lld", (unsigned long) EXT4_C2B(sbi, lclu), depth, path[depth].p_block); err = -EFSCORRUPTED; goto out; } extent = path[depth].p_ext; /* can't be mapped if the extent tree is empty */ if (extent == NULL) goto out; first_lblk = le32_to_cpu(extent->ee_block); first_lclu = EXT4_B2C(sbi, first_lblk); /* * Three possible outcomes at this point - found extent spanning * the target cluster, to the left of the target cluster, or to the * right of the target cluster. The first two cases are handled here. * The last case indicates the target cluster is not mapped. */ if (lclu >= first_lclu) { last_lclu = EXT4_B2C(sbi, first_lblk + ext4_ext_get_actual_len(extent) - 1); if (lclu <= last_lclu) { mapped = 1; } else { first_lblk = ext4_ext_next_allocated_block(path); first_lclu = EXT4_B2C(sbi, first_lblk); if (lclu == first_lclu) mapped = 1; } } out: ext4_free_ext_path(path); return err ? err : mapped; } /* * Updates physical block address and unwritten status of extent * starting at lblk start and of len. If such an extent doesn't exist, * this function splits the extent tree appropriately to create an * extent like this. This function is called in the fast commit * replay path. Returns 0 on success and error on failure. */ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { struct ext4_ext_path *path = NULL, *ppath; struct ext4_extent *ex; int ret; path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ret = -EFSCORRUPTED; goto out; } if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ ppath = path; down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; kfree(path); path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return -1; ppath = path; ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_force_split_extent_at(NULL, inode, &ppath, start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; kfree(path); path = ext4_find_extent(inode, start, NULL, 0); if (IS_ERR(path)) return -EINVAL; ex = path[path->p_depth].p_ext; } } if (unwritten) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); ext4_ext_store_pblock(ex, pblk); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]); up_write(&EXT4_I(inode)->i_data_sem); out: ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return ret; } /* Try to shrink the extent tree */ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) { struct ext4_ext_path *path = NULL; struct ext4_extent *ex; ext4_lblk_t old_cur, cur = 0; while (cur < end) { path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) return; ex = path[path->p_depth].p_ext; if (!ex) { ext4_free_ext_path(path); ext4_mark_inode_dirty(NULL, inode); return; } old_cur = cur; cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); if (cur <= old_cur) cur = old_cur + 1; ext4_ext_try_to_merge(NULL, inode, path, ex); down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_dirty(NULL, inode, &path[path->p_depth]); 
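		/*
		 * Fast-commit replay runs without a running transaction,
		 * which is why a NULL handle is passed to ext4_ext_dirty()
		 * and ext4_mark_inode_dirty() throughout this path.
		 */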
up_write(&EXT4_I(inode)->i_data_sem);
		ext4_mark_inode_dirty(NULL, inode);
		ext4_free_ext_path(path);
	}
}

/* Check if *cur is a hole and if it is, skip it */
static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
{
	int ret;
	struct ext4_map_blocks map;

	map.m_lblk = *cur;
	map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;

	ret = ext4_map_blocks(NULL, inode, &map, 0);
	if (ret < 0)
		return ret;
	if (ret != 0)
		return 0;
	*cur = *cur + map.m_len;
	return 0;
}

/* Count number of blocks used by this inode and update i_blocks */
int ext4_ext_replay_set_iblocks(struct inode *inode)
{
	struct ext4_ext_path *path = NULL, *path2 = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t cur = 0, end;
	int numblks = 0, i, ret = 0;
	ext4_fsblk_t cmp1, cmp2;
	struct ext4_map_blocks map;

	/* Determine the size of the file first */
	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
					EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return PTR_ERR(path);
	ex = path[path->p_depth].p_ext;
	if (!ex) {
		ext4_free_ext_path(path);
		goto out;
	}
	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
	ext4_free_ext_path(path);

	/* Count the number of data blocks */
	cur = 0;
	while (cur < end) {
		map.m_lblk = cur;
		map.m_len = end - cur;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			break;
		if (ret > 0)
			numblks += ret;
		cur = cur + map.m_len;
	}

	/*
	 * Count the number of extent tree blocks. We do it by looking up
	 * two successive extents and determining the difference between
	 * their paths. When the path differs for two successive extents,
	 * we compare the blocks in the path at each level and increment
	 * iblocks by the total number of differences found.
	 */
	cur = 0;
	ret = skip_hole(inode, &cur);
	if (ret < 0)
		goto out;
	path = ext4_find_extent(inode, cur, NULL, 0);
	if (IS_ERR(path))
		goto out;
	numblks += path->p_depth;
	ext4_free_ext_path(path);
	while (cur < end) {
		path = ext4_find_extent(inode, cur, NULL, 0);
		if (IS_ERR(path))
			break;
		ex = path[path->p_depth].p_ext;
		if (!ex) {
			ext4_free_ext_path(path);
			return 0;
		}
		cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
				ext4_ext_get_actual_len(ex));
		ret = skip_hole(inode, &cur);
		if (ret < 0) {
			ext4_free_ext_path(path);
			break;
		}
		path2 = ext4_find_extent(inode, cur, NULL, 0);
		if (IS_ERR(path2)) {
			ext4_free_ext_path(path);
			break;
		}
		for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
			cmp1 = cmp2 = 0;
			if (i <= path->p_depth)
				cmp1 = path[i].p_bh ?
					path[i].p_bh->b_blocknr : 0;
			if (i <= path2->p_depth)
				cmp2 = path2[i].p_bh ?
					path2[i].p_bh->b_blocknr : 0;
			if (cmp1 != cmp2 && cmp2 != 0)
				numblks++;
		}
		ext4_free_ext_path(path);
		ext4_free_ext_path(path2);
	}

out:
	inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
	ext4_mark_inode_dirty(NULL, inode);
	return 0;
}

int ext4_ext_clear_bb(struct inode *inode)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t cur = 0, end;
	int j, ret = 0;
	struct ext4_map_blocks map;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		return 0;

	/* Determine the size of the file first */
	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
					EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return PTR_ERR(path);
	ex = path[path->p_depth].p_ext;
	if (!ex) {
		ext4_free_ext_path(path);
		return 0;
	}
	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
	ext4_free_ext_path(path);

	cur = 0;
	while (cur < end) {
		map.m_lblk = cur;
		map.m_len = end - cur;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
			if (!IS_ERR_OR_NULL(path)) {
				for (j = 0; j < path->p_depth; j++) {
					ext4_mb_mark_bb(inode->i_sb,
						path[j].p_block, 1, false);
					ext4_fc_record_regions(inode->i_sb,
						inode->i_ino, 0,
						path[j].p_block, 1, 1);
				}
				ext4_free_ext_path(path);
			}
			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
			ext4_fc_record_regions(inode->i_sb, inode->i_ino,
					map.m_lblk, map.m_pblk, map.m_len, 1);
		}
		cur = cur + map.m_len;
	}

	return 0;
}
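/*
 * Illustration only, not part of the kernel sources above: a minimal
 * userspace sketch (hypothetical file name and sizes) showing how the
 * fallocate(2) modes implemented by ext4_fallocate() are driven. The
 * FALLOC_FL_* flags come from <fcntl.h> with _GNU_SOURCE defined.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("demo.dat", O_RDWR | O_CREAT, 0644);
 *
 *		if (fd < 0) {
 *			perror("open");
 *			return 1;
 *		}
 *		// Preallocate 16 MiB of unwritten extents, i_size unchanged:
 *		if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20))
 *			perror("keep-size preallocation");
 *		// Extend the file to 16 MiB, then zero 1 MiB in place:
 *		if (fallocate(fd, 0, 0, 16 << 20))
 *			perror("preallocate");
 *		if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4 << 20, 1 << 20))
 *			perror("zero range");
 *		// Collapse the same range: later blocks shift left and the
 *		// file shrinks by 1 MiB; offset and length must be aligned
 *		// to the filesystem cluster size or ext4 returns EINVAL:
 *		if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4 << 20, 1 << 20))
 *			perror("collapse range");
 *		close(fd);
 *		return 0;
 *	}
 */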
// SPDX-License-Identifier: GPL-2.0
/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
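 *
 * Implements the cgroup-v1 "devices" controller: each cgroup carries a
 * default behavior (allow or deny) plus a list of exceptions keyed by
 * device type, major:minor and access mask, administered through the
 * devices.allow, devices.deny and devices.list files, e.g.
 * "echo 'c 1:3 mr' > devices.allow".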
*/ #include <linux/bpf-cgroup.h> #include <linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - 
initializes devcgroup's behavior and exceptions based on * parent's * @css: css getting online * returns 0 in case of success, error code otherwise */ static int devcgroup_online(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent); int ret = 0; mutex_lock(&devcgroup_mutex); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); if (!ret) dev_cgroup->behavior = parent_dev_cgroup->behavior; } mutex_unlock(&devcgroup_mutex); return ret; } static void devcgroup_offline(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; mutex_unlock(&devcgroup_mutex); } /* * called from kernel/cgroup/cgroup.c with cgroup_lock() held. */ static struct cgroup_subsys_state * devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL); if (!dev_cgroup) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dev_cgroup->exceptions); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; return &dev_cgroup->css; } static void devcgroup_css_free(struct cgroup_subsys_state *css) { struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } #define DEVCG_ALLOW 1 #define DEVCG_DENY 2 #define DEVCG_LIST 3 #define MAJMINLEN 13 #define ACCLEN 4 static void set_access(char *acc, short access) { int idx = 0; memset(acc, 0, ACCLEN); if (access & DEVCG_ACC_READ) acc[idx++] = 'r'; if (access & DEVCG_ACC_WRITE) acc[idx++] = 'w'; if (access & DEVCG_ACC_MKNOD) acc[idx++] = 'm'; } static char type_to_char(short type) { if (type == DEVCG_DEV_ALL) return 'a'; if (type == DEVCG_DEV_CHAR) return 'c'; if (type == DEVCG_DEV_BLOCK) return 'b'; return 'X'; } static void set_majmin(char *str, unsigned m) { if (m == ~0) strcpy(str, "*"); else sprintf(str, "%u", m); } static int devcgroup_seq_show(struct seq_file *m, void *v) { struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; rcu_read_lock(); /* * To preserve the compatibility: * - Only show the "all devices" when the default policy is to allow * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { set_access(acc, DEVCG_ACC_MASK); set_majmin(maj, ~0); set_majmin(min, ~0); seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL), maj, min, acc); } else { list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) { set_access(acc, ex->access); set_majmin(maj, ex->major); set_majmin(min, ex->minor); seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), maj, min, acc); } } rcu_read_unlock(); return 0; } /** * match_exception - iterates the exception list trying to find a complete match * @exceptions: list of exceptions * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR) * @major: device file major number, ~0 to match all * @minor: device file minor number, ~0 to match all * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD) * * It is considered a complete match if an exception is found that will * contain the entire range of provided parameters. 
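 *
 * For example (illustrative values): an exception "b *:* rwm"
 * completely matches a request for (DEVCG_DEV_BLOCK, 8, 0,
 * DEVCG_ACC_READ), while "b 8:0 r" does not completely match a request
 * for (DEVCG_DEV_BLOCK, 8, 0, DEVCG_ACC_READ | DEVCG_ACC_WRITE).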
 *
 * Return: true in case it matches an exception completely
 */
static bool match_exception(struct list_head *exceptions, short type,
			    u32 major, u32 minor, short access)
{
	struct dev_exception_item *ex;

	list_for_each_entry_rcu(ex, exceptions, list) {
		if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
			continue;
		if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
			continue;
		if (ex->major != ~0 && ex->major != major)
			continue;
		if (ex->minor != ~0 && ex->minor != minor)
			continue;
		/* provided access cannot have more than the exception rule */
		if (access & (~ex->access))
			continue;
		return true;
	}
	return false;
}

/**
 * match_exception_partial - iterates the exception list trying to find a partial match
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * It is considered a partial match if an exception's range is found to
 * contain *any* of the devices specified by the provided parameters. This is
 * used to make sure no extra access is being granted that is forbidden by
 * any of the exceptions in the list.
 *
 * Return: true in case the provided range partially matches an exception
 */
static bool match_exception_partial(struct list_head *exceptions, short type,
				    u32 major, u32 minor, short access)
{
	struct dev_exception_item *ex;

	list_for_each_entry_rcu(ex, exceptions, list,
				lockdep_is_held(&devcgroup_mutex)) {
		if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
			continue;
		if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
			continue;
		/*
		 * We must be sure that both the exception and the provided
		 * range aren't masking all devices
		 */
		if (ex->major != ~0 && major != ~0 && ex->major != major)
			continue;
		if (ex->minor != ~0 && minor != ~0 && ex->minor != minor)
			continue;
		/*
		 * In order to make sure the provided range isn't matching
		 * an exception, all its access bits shouldn't match the
		 * exception's access bits
		 */
		if (!(access & ex->access))
			continue;
		return true;
	}
	return false;
}

/**
 * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 * @behavior: behavior of the exception's dev_cgroup
 *
 * This is used to make sure a child cgroup won't have more privileges
 * than its parent
 */
static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
			  struct dev_exception_item *refex,
			  enum devcg_behavior behavior)
{
	bool match = false;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
			 !lockdep_is_held(&devcgroup_mutex),
			 "device_cgroup:verify_new_ex called without proper synchronization");

	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
		if (behavior == DEVCG_DEFAULT_ALLOW) {
			/*
			 * new exception in the child doesn't matter, only
			 * adding extra restrictions
			 */
			return true;
		} else {
			/*
			 * new exception in the child will add more devices
			 * that can be accessed, so it can't match any of
			 * parent's exceptions, even slightly
			 */
			match = match_exception_partial(&dev_cgroup->exceptions,
							refex->type,
							refex->major,
							refex->minor,
							refex->access);

			if (match)
				return false;
			return true;
		}
	} else {
		/*
		 * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore
		 * the new exception will add access to more devices and must
		 * be contained completely in a parent's exception to be
		 * allowed
		 */
		match = match_exception(&dev_cgroup->exceptions, refex->type,
					refex->major, refex->minor,
/** * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions * @dev_cgroup: dev cgroup to be tested against * @refex: new exception * @behavior: behavior of the exception's dev_cgroup * * This is used to make sure a child cgroup won't have more privileges * than its parent */ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, struct dev_exception_item *refex, enum devcg_behavior behavior) { bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { if (behavior == DEVCG_DEFAULT_ALLOW) { /* * new exception in the child doesn't matter, only * adding extra restrictions */ return true; } else { /* * new exception in the child will add more devices * that can be accessed, so it can't match any of * parent's exceptions, even slightly */ match = match_exception_partial(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) return false; return true; } } else { /* * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore * the new exception will add access to more devices and must * be contained completely in a parent's exception to be * allowed */ match = match_exception(&dev_cgroup->exceptions, refex->type, refex->major, refex->minor, refex->access); if (match) /* parent has an exception that matches the proposed */ return true; else return false; } return false; } /* * parent_has_perm: * when adding a new allow rule to a device exception list, the rule * must be allowed in the parent device */ static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return 1; return verify_new_ex(parent, ex, childcg->behavior); } /** * parent_allows_removal - verify if it's ok to remove an exception * @childcg: child cgroup from where the exception will be removed * @ex: exception being removed * * When removing an exception in cgroups with default ALLOW policy, it must * be checked if removing it will give the child cgroup more access than the * parent. * * Return: true if it's ok to remove exception, false otherwise */ static bool parent_allows_removal(struct dev_cgroup *childcg, struct dev_exception_item *ex) { struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent); if (!parent) return true; /* It's always allowed to remove access to devices */ if (childcg->behavior == DEVCG_DEFAULT_DENY) return true; /* * Make sure you're not removing part of or a whole exception existing in * the parent cgroup */ return !match_exception_partial(&parent->exceptions, ex->type, ex->major, ex->minor, ex->access); } /** * may_allow_all - checks if it's possible to change the behavior to * allow based on parent's rules. * @parent: device cgroup's parent * returns: != 0 in case it's allowed, 0 otherwise */ static inline int may_allow_all(struct dev_cgroup *parent) { if (!parent) return 1; return parent->behavior == DEVCG_DEFAULT_ALLOW; } /** * revalidate_active_exceptions - walks through the active exception list and * revalidates the exceptions based on parent's * behavior and exceptions. The exceptions that * are no longer valid will be removed. * Called with devcgroup_mutex held. * @devcg: cgroup which exceptions will be checked * * This is one of the three key functions for hierarchy implementation. * This function is responsible for re-evaluating all the cgroup's active * exceptions due to a parent's exception change. * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details. */ static void revalidate_active_exceptions(struct dev_cgroup *devcg) { struct dev_exception_item *ex; struct list_head *this, *tmp; list_for_each_safe(this, tmp, &devcg->exceptions) { ex = container_of(this, struct dev_exception_item, list); if (!parent_has_perm(devcg, ex)) dev_exception_rm(devcg, ex); } } /** * propagate_exception - propagates a new exception to the children * @devcg_root: device cgroup that added a new exception * @ex: new exception to be propagated * * returns: 0 in case of success, != 0 in case of error */ static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); css_for_each_descendant_pre(pos, &devcg_root->css) { struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become * online or offline during the tree walk (see on/offline * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt.
*/ if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); /* * in case both the root's and devcg's behavior is allow, a new * restriction means adding to the exception list */ if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW && devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) return rc; } else { /* * in the other possible cases: * root's behavior: allow, devcg's: deny * root's behavior: deny, devcg's: deny * the exception will be removed */ dev_exception_rm(devcg, ex); } revalidate_active_exceptions(devcg); rcu_read_lock(); } rcu_read_unlock(); return rc; } /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD * so we can give a container CAP_MKNOD to let it create devices but not * modify the exception list. * It seems likely we'll want to add a CAP_CONTAINER capability to allow * us to also grant CAP_SYS_ADMIN to containers without giving away the * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN * * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting * new access is only allowed if you're in the top-level cgroup, or your * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent); struct dev_cgroup tmp_devcgrp; if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(&ex, 0, sizeof(ex)); memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp)); b = buffer; switch (*b) { case 'a': switch (filetype) { case DEVCG_ALLOW: if (css_has_online_children(&devcgroup->css)) return -EINVAL; if (!may_allow_all(parent)) return -EPERM; if (!parent) { devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(devcgroup); break; } INIT_LIST_HEAD(&tmp_devcgrp.exceptions); rc = dev_exceptions_copy(&tmp_devcgrp.exceptions, &devcgroup->exceptions); if (rc) return rc; dev_exception_clean(devcgroup); rc = dev_exceptions_copy(&devcgroup->exceptions, &parent->exceptions); if (rc) { dev_exceptions_move(&devcgroup->exceptions, &tmp_devcgrp.exceptions); return rc; } devcgroup->behavior = DEVCG_DEFAULT_ALLOW; dev_exception_clean(&tmp_devcgrp); break; case DEVCG_DENY: if (css_has_online_children(&devcgroup->css)) return -EINVAL; dev_exception_clean(devcgroup); devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; } return 0; case 'b': ex.type = DEVCG_DEV_BLOCK; break; case 'c': ex.type = DEVCG_DEV_CHAR; break; default: return -EINVAL; } b++; if (!isspace(*b)) return -EINVAL; b++; if (*b == '*') { ex.major = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.major); if (rc) return -EINVAL; } else { return -EINVAL; } if (*b != ':') return -EINVAL; b++; /* read minor */ if (*b == '*') { ex.minor = ~0; b++; } else if (isdigit(*b)) { memset(temp, 0, sizeof(temp)); for (count = 0; count < sizeof(temp) - 1; count++) { temp[count] = *b; b++; if (!isdigit(*b)) break; } rc = kstrtou32(temp, 10, &ex.minor); if (rc) return -EINVAL; } else { return -EINVAL; } if (!isspace(*b)) return -EINVAL; for (b++, count = 0; count < 3; count++, b++) { switch (*b) { case 'r': ex.access |= DEVCG_ACC_READ; break; case 'w': ex.access |=
DEVCG_ACC_WRITE; break; case 'm': ex.access |= DEVCG_ACC_MKNOD; break; case '\n': case '\0': count = 3; break; default: return -EINVAL; } } switch (filetype) { case DEVCG_ALLOW: /* * If the default policy is to allow by default, try to remove * a matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { /* Check if the parent allows removing it first */ if (!parent_allows_removal(devcgroup, &ex)) return -EPERM; dev_exception_rm(devcgroup, &ex); break; } if (!parent_has_perm(devcgroup, &ex)) return -EPERM; rc = dev_exception_add(devcgroup, &ex); break; case DEVCG_DENY: /* * If the default policy is to deny by default, try to remove * a matching exception instead. And be silent about it: we * don't want to break compatibility */ if (devcgroup->behavior == DEVCG_DEFAULT_DENY) dev_exception_rm(devcgroup, &ex); else rc = dev_exception_add(devcgroup, &ex); if (rc) break; /* we only propagate new restrictions */ rc = propagate_exception(devcgroup, &ex); break; default: rc = -EINVAL; } return rc; } static ssize_t devcgroup_access_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int retval; mutex_lock(&devcgroup_mutex); retval = devcgroup_update_access(css_to_devcgroup(of_css(of)), of_cft(of)->private, strstrip(buf)); mutex_unlock(&devcgroup_mutex); return retval ?: nbytes; } static struct cftype dev_cgroup_files[] = { { .name = "allow", .write = devcgroup_access_write, .private = DEVCG_ALLOW, }, { .name = "deny", .write = devcgroup_access_write, .private = DEVCG_DENY, }, { .name = "list", .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ }; struct cgroup_subsys devices_cgrp_subsys = { .css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, .legacy_cftypes = dev_cgroup_files, }; /** * devcgroup_legacy_check_permission - checks if an inode operation is permitted * @type: device type * @major: device major number * @minor: device minor number * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD * * returns 0 on success, -EPERM in case the operation is not permitted */ static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor, short access) { struct dev_cgroup *dev_cgroup; bool rc; rcu_read_lock(); dev_cgroup = task_devcgroup(current); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) /* Can't match any of the exceptions, even partially */ rc = !match_exception_partial(&dev_cgroup->exceptions, type, major, minor, access); else /* Need to match completely one exception to be allowed */ rc = match_exception(&dev_cgroup->exceptions, type, major, minor, access); rcu_read_unlock(); if (!rc) return -EPERM; return 0; } #endif /* CONFIG_CGROUP_DEVICE */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); #else /* CONFIG_CGROUP_DEVICE */ return 0; #endif /* CONFIG_CGROUP_DEVICE */ } EXPORT_SYMBOL(devcgroup_check_permission); #endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */
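That closes the controller. For orientation, a hedged userspace sketch of driving it through the legacy cgroup-v1 files; the mount point /sys/fs/cgroup/devices and the child group name sandbox are assumptions (the group must already exist), while the rule strings follow the "type major:minor access" grammar parsed by devcgroup_update_access() above:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write one rule string into a devices.allow/devices.deny file. */
static int write_rule(const char *file, const char *rule)
{
	int fd = open(file, O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror(file);
		return -1;
	}
	n = write(fd, rule, strlen(rule));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	const char *grp = "/sys/fs/cgroup/devices/sandbox"; /* assumed to exist */
	char path[256];

	/* Switch to default-deny, then whitelist /dev/null (char 1:3). */
	snprintf(path, sizeof(path), "%s/devices.deny", grp);
	write_rule(path, "a");
	snprintf(path, sizeof(path), "%s/devices.allow", grp);
	write_rule(path, "c 1:3 rw");
	return 0;
}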
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H #include <linux/interrupt.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> #include <linux/refcount.h> #include <asm/byteorder.h> /* This is for all connections with a full identity, no wildcards. * The 'e' prefix stands for Establish, but we really put all sockets * but LISTEN ones. */ struct inet_ehash_bucket { struct hlist_nulls_head chain; }; /* There are a few simple rules, which allow for local port reuse by * an application. In essence: * * 1) Sockets bound to different interfaces may share a local port. * Failing that, goto test 2. * 2) If all sockets have sk->sk_reuse set, and none of them are in * TCP_LISTEN state, the port may be shared. * Failing that, goto test 3. * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */
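A quick userspace illustration of test 2 (hypothetical port number, error handling trimmed): two TCP sockets may bind the same local port as long as every owner sets SO_REUSEADDR, i.e. sk->sk_reuse, and none of them has entered TCP_LISTEN:

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int one = 1;
	int a = socket(AF_INET, SOCK_STREAM, 0);
	int b = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port = htons(5000),              /* arbitrary test port */
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};

	setsockopt(a, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	setsockopt(b, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	printf("first bind:  %d\n", bind(a, (struct sockaddr *)&sa, sizeof(sa)));
	printf("second bind: %d\n", bind(b, (struct sockaddr *)&sa, sizeof(sa)));
	/* both succeed; a conflict would only surface at listen() time */
	return 0;
}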
#define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets: * A socket might transition from ESTABLISHED to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding TCP hash as well as the others for fast bind/connect.
*/ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { #if IS_ENABLED(CONFIG_IP_DCCP) return sk->sk_prot->h.hashinfo ? : sock_net(sk)->ipv4.tcp_death_row.hashinfo; #else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; #endif } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) { kfree(h->lhash2); h->lhash2 = NULL; } static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a socket's sk_rcv_saddr (ipv4) or * 
sk_v6_rcv_saddr (ipv6) changes after it has been bound. The socket's * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_portpair != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); }
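The macros above are worth a small demonstration. A hypothetical userspace model of the little-endian INET_COMBINED_PORTS() variant (plain uint32_t instead of __portpair): packing the two 16-bit ports into one 32-bit key lets inet_match() check both with a single compare:

#include <stdint.h>
#include <stdio.h>

/* Little-endian layout: destination port in the high half, source port in
 * the low half, mirroring the macro above. */
static uint32_t combined_ports(uint16_t sport, uint16_t dport)
{
	return ((uint32_t)dport << 16) | (uint32_t)sport;
}

int main(void)
{
	uint32_t key  = combined_ports(443, 54321); /* what we search for */
	uint32_t pair = combined_ports(443, 54321); /* as read from a socket */

	printf("both ports equal: %s (one 32-bit compare, key=0x%08x)\n",
	       key == pair ? "yes" : "no", key);
	return 0;
}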
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted.
*/ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */
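End of the header. The bind-hash function declared further up is simple enough to model on its own; a sketch under stated assumptions (net_hash_mix() is stubbed with a constant salt, and the table size is a power of two, as bhash_size always is):

#include <stdint.h>
#include <stdio.h>

#define BHASH_SIZE   1024         /* stand-in for hashinfo->bhash_size */
#define NET_HASH_MIX 0x9e3779b9u  /* stand-in for net_hash_mix(net) */

/* Mirrors inet_bhashfn(): local port plus per-netns salt, masked. */
static uint32_t bhashfn(uint16_t lport)
{
	return (lport + NET_HASH_MIX) & (BHASH_SIZE - 1);
}

int main(void)
{
	printf("port 80   -> bucket %u\n", bhashfn(80));
	printf("port 8080 -> bucket %u\n", bhashfn(8080));
	return 0;
}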
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include <linux/sock_diag.h> #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ ipv6_addr_set_v4mapped(saddr, pin6); \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ ipv6_addr_set_v4mapped(daddr, pin6); \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ do { \ if (sk->sk_family == AF_INET6) { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ *pin6 = saddr6; \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ *pin6 = daddr6; \ } else { \ TP_STORE_V4MAPPED(__entry, saddr, daddr); \ } \ } while (0) #else #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ TP_STORE_V4MAPPED(__entry, saddr, daddr) #endif /* * tcp event with arguments sk and skb * * Note: this class requires a valid sk pointer, while the skb pointer may * be NULL.
*/ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) ); DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. */ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); __entry->family = sk->sk_family; p32 = (__be32 *) 
__entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) \ do { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in *v4 = (void *)__entry->saddr; \ \ v4->sin_family = AF_INET; \ v4->sin_port = th->source; \ v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ v4 = (void *)__entry->daddr; \ v4->sin_family = AF_INET; \ v4->sin_port = th->dest; \ v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ do { \ const struct iphdr *iph = ip_hdr(skb); \ \ if (iph->version == 6) { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->source; \ v6->sin6_addr = ipv6_hdr(skb)->saddr; \ v6 = (void *)__entry->daddr; \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->dest; \ v6->sin6_addr = ipv6_hdr(skb)->daddr; \ } else \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb); \ } while (0) #else #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) #endif /* * tcp event with only skb */ DECLARE_EVENT_CLASS(tcp_event_skb, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field(const void *, skbaddr) __array(__u8, 
saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(__entry, skb); ), TP_printk("src=%pISpc dest=%pISpc", __entry->saddr, __entry->daddr) ); DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); TRACE_EVENT(tcp_cong_state_set, TP_PROTO(struct sock *sk, const u8 ca_state), TP_ARGS(sk, ca_state), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u8, cong_state) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->cong_state = ca_state; ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->cong_state) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
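Each event defined in this header is exposed through tracefs once tracing is enabled. A hedged userspace sketch that turns on tcp:tcp_probe and reads one formatted line; it assumes tracefs is mounted at the conventional /sys/kernel/tracing path and that the process has sufficient privileges:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *enable = "/sys/kernel/tracing/events/tcp/tcp_probe/enable";
	char line[512];
	FILE *pipe;
	int fd;

	fd = open(enable, O_WRONLY);
	if (fd < 0) {
		perror(enable);
		return 1;
	}
	write(fd, "1", 1); /* every TRACE_EVENT/DEFINE_EVENT gets such a file */
	close(fd);

	/* The TP_printk() output appears on the shared trace pipe. */
	pipe = fopen("/sys/kernel/tracing/trace_pipe", "r");
	if (pipe) {
		if (fgets(line, sizeof(line), pipe))
			fputs(line, stdout);
		fclose(pipe);
	}
	return 0;
}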
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2015 Regents of the University of California */ #ifndef _ASM_RISCV_CACHEFLUSH_H #define _ASM_RISCV_CACHEFLUSH_H #include <linux/mm.h> static inline void local_flush_icache_all(void) { asm volatile ("fence.i" ::: "memory"); } #define PG_dcache_clean PG_arch_1 static inline void flush_dcache_folio(struct folio *folio) { if (test_bit(PG_dcache_clean, &folio->flags)) clear_bit(PG_dcache_clean, &folio->flags); } #define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_page(struct page *page) { flush_dcache_folio(page_folio(page)); } /* * RISC-V doesn't have an instruction to flush parts of the instruction cache, * so instead we just flush the whole thing. */ #define flush_icache_range(start, end) flush_icache_all() #define flush_icache_user_page(vma, pg, addr, len) \ flush_icache_mm(vma->vm_mm, 0) #ifdef CONFIG_64BIT #define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end) #define flush_cache_vmap_early(start, end) local_flush_tlb_kernel_range(start, end) #endif #ifndef CONFIG_SMP #define flush_icache_all() local_flush_icache_all() #define flush_icache_mm(mm, local) flush_icache_all() #else /* CONFIG_SMP */ void flush_icache_all(void); void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ extern unsigned int riscv_cbom_block_size; extern unsigned int riscv_cboz_block_size; void riscv_init_cbo_blocksizes(void); #ifdef CONFIG_RISCV_DMA_NONCOHERENT void riscv_noncoherent_supported(void); void __init riscv_set_dma_cache_alignment(void); #else static inline void riscv_noncoherent_supported(void) {} static inline void riscv_set_dma_cache_alignment(void) {} #endif /* * Bits in sys_riscv_flush_icache()'s flags argument. */ #define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL #define SYS_RISCV_FLUSH_ICACHE_ALL (SYS_RISCV_FLUSH_ICACHE_LOCAL) #include <asm-generic/cacheflush.h> #endif /* _ASM_RISCV_CACHEFLUSH_H */
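The flush-the-whole-icache policy above has a userspace counterpart: freshly written instructions must be made visible to the instruction stream before they are executed. A hedged RISC-V Linux sketch (the single compressed instruction and the mmap'd buffer are illustrative only; __builtin___clear_cache is the portable GCC/Clang route, which ends up in sys_riscv_flush_icache):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* c.jr ra ("ret"), little-endian: a function that returns at once.
	 * Assumes the C (compressed) extension is available. */
	uint16_t insn = 0x8082;
	void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memcpy(buf, &insn, sizeof(insn));

	/* Make the store visible to instruction fetch before jumping in. */
	__builtin___clear_cache((char *)buf, (char *)buf + sizeof(insn));

	((void (*)(void))buf)();
	return 0;
}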
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" #define DEVLINK_PORT_FN_CAPS_VALID_MASK \ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ WARN_ON_ONCE(!(devlink_port)->registered) #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ WARN_ON_ONCE((devlink_port)->registered) struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { return xa_load(&devlink->ports, port_index); } struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { if (attrs[DEVLINK_ATTR_PORT_INDEX]) { u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return ERR_PTR(-ENODEV); return devlink_port; } return ERR_PTR(-EINVAL); } struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_port_get_from_attrs(devlink, info->attrs); } static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, u32 cap, bool is_enable) { caps->selector |= cap; if (is_enable) caps->value |= cap; } static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_roce_get) return 0; err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); return 0; } static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_migratable_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_migratable_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); return 0; } static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_crypto_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable); return 0; } static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port, struct nla_bitfield32 *caps, struct netlink_ext_ack *extack) { bool is_enable; int err; if (!devlink_port->ops->port_fn_ipsec_packet_get || devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) return 0; err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, 
is_enable); return 0; } static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { struct nla_bitfield32 caps = {}; int err; err = devlink_port_fn_roce_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack); if (err) return err; err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack); if (err) return err; if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, caps.selector); if (err) return err; *msg_updated = true; return 0; } int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) { if (devlink_nl_put_handle(msg, devlink_port->devlink)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) return -EMSGSIZE; return 0; } size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ } static int devlink_nl_port_attrs_put(struct sk_buff *msg, struct devlink_port *devlink_port) { struct devlink_port_attrs *attrs = &devlink_port->attrs; if (!devlink_port->attrs_set) return 0; if (attrs->lanes) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes)) return -EMSGSIZE; } if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable)) return -EMSGSIZE; if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_pf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_vf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, attrs->pci_sf.controller) || nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_sf.pf) || nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER, attrs->pci_sf.sf)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->phys.port_number)) return -EMSGSIZE; if (!attrs->split) return 0; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->phys.port_number)) return -EMSGSIZE; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, attrs->phys.split_subport_number)) return -EMSGSIZE; break; default: break; } return 0; } static int devlink_port_fn_hw_addr_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; int err; if (!port->ops->port_fn_hw_addr_get) return 0; err = 
port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr); if (err) return err; *msg_updated = true; return 0; } static bool devlink_port_fn_state_valid(enum devlink_port_fn_state state) { return state == DEVLINK_PORT_FN_STATE_INACTIVE || state == DEVLINK_PORT_FN_STATE_ACTIVE; } static bool devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) { return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED || opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } static int devlink_port_fn_state_fill(struct devlink_port *port, struct sk_buff *msg, struct netlink_ext_ack *extack, bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; int err; if (!port->ops->port_fn_state_get) return 0; err = port->ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (!devlink_port_fn_state_valid(state)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid state read from driver"); return -EINVAL; } if (!devlink_port_fn_opstate_valid(opstate)) { WARN_ON_ONCE(1); NL_SET_ERR_MSG(extack, "Invalid operational state read from driver"); return -EINVAL; } if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) || nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate)) return -EMSGSIZE; *msg_updated = true; return 0; } static int devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_migratable_set(devlink_port, enable, extack); } static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_roce_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack); } static int devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) { return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack); } static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nla_bitfield32 caps; u32 caps_value; int err; caps = nla_get_bitfield32(attr); caps_value = caps.value & caps.selector; if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { err = devlink_port_fn_roce_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_ROCE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { err = devlink_port_fn_mig_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_MIGRATABLE, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, extack); if (err) return err; } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value & DEVLINK_PORT_FN_CAP_IPSEC_PACKET, extack); if (err) return err; } return 0; } static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { struct nlattr *function_attr; bool msg_updated = false; int err; function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION); if (!function_attr) return 
-EMSGSIZE; err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated); if (err) goto out; err = devlink_rel_devlink_handle_put(msg, port->devlink, port->rel_index, DEVLINK_PORT_FN_ATTR_DEVLINK, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); else nla_nest_end(msg, function_attr); return err; } static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET && nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, ibdev->name)) goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) goto nla_put_failure; if (devlink_port->linecard && nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, devlink_linecard_index(devlink_port->linecard))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_port_notify(struct devlink_port *devlink_port, enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct devlink_obj_desc desc; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_obj_desc_init(&desc, devlink); devlink_nl_obj_desc_port_set(&desc, devlink_port); devlink_nl_notify_send_desc(devlink, msg, &desc); } static void devlink_ports_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_port *devlink_port; unsigned long port_index; xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, cmd); } void devlink_ports_notify_register(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW); } void devlink_ports_notify_unregister(struct devlink *devlink) { devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL); } int devlink_nl_port_get_doit(struct sk_buff *skb, struct 
genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_port *devlink_port; unsigned long port_index; int err = 0; xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) { err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, cb->extack); if (err) { state->idx = port_index; break; } } return err; } int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one); } static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; if (!devlink_port->ops->port_type_set) return -EOPNOTSUPP; if (port_type == devlink_port->type) return 0; err = devlink_port->ops->port_type_set(devlink_port, port_type); if (err) return err; devlink_port->desired_type = port_type; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } static int devlink_port_function_hw_addr_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { const u8 *hw_addr; int hw_addr_len; hw_addr = nla_data(attr); hw_addr_len = nla_len(attr); if (hw_addr_len > MAX_ADDR_LEN) { NL_SET_ERR_MSG(extack, "Port function hardware address too long"); return -EINVAL; } if (port->type == DEVLINK_PORT_TYPE_ETH) { if (hw_addr_len != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device"); return -EINVAL; } if (!is_unicast_ether_addr(hw_addr)) { NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported"); return -EINVAL; } } return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len, extack); } static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { enum devlink_port_fn_state state; state = nla_get_u8(attr); return port->ops->port_fn_state_set(port, state, extack); } static int devlink_port_function_validate(struct devlink_port *devlink_port, struct nlattr **tb, struct netlink_ext_ack *extack) { const struct devlink_port_ops *ops = devlink_port->ops; struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_fn_hw_addr_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], "Port doesn't support function attributes"); return -EOPNOTSUPP; } if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE], "Function does not support state setting"); return -EOPNOTSUPP; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { struct nla_bitfield32 caps; caps = nla_get_bitfield32(attr); if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && !ops->port_fn_roce_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { if (!ops->port_fn_migratable_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support migratable function attribute"); return -EOPNOTSUPP; } if
(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "migratable function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) { if (!ops->port_fn_ipsec_crypto_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_crypto function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_crypto function attribute supported for VFs only"); return -EOPNOTSUPP; } } if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) { if (!ops->port_fn_ipsec_packet_set) { NL_SET_ERR_MSG_ATTR(extack, attr, "Port doesn't support ipsec_packet function attribute"); return -EOPNOTSUPP; } if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { NL_SET_ERR_MSG_ATTR(extack, attr, "ipsec_packet function attribute supported for VFs only"); return -EOPNOTSUPP; } } } return 0; } static int devlink_port_function_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr, devlink_function_nl_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Failed to parse port function attributes"); return err; } err = devlink_port_function_validate(port, tb, extack); if (err) return err; attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; if (attr) { err = devlink_port_fn_caps_set(port, attr, extack); if (err) return err; } /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * those can be applied before the state is activated. */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); return err; } int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) { struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } return 0; } int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; u32 count; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink_port->ops->port_split) return -EOPNOTSUPP; count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]); if (!devlink_port->attrs.splittable) { /* Split ports cannot be split.
*/ if (devlink_port->attrs.split) NL_SET_ERR_MSG(info->extack, "Port cannot be split further"); else NL_SET_ERR_MSG(info->extack, "Port cannot be split"); return -EINVAL; } if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) { NL_SET_ERR_MSG(info->extack, "Invalid split count"); return -EINVAL; } return devlink_port->ops->port_split(devlink, devlink_port, count, info->extack); } int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_unsplit) return -EOPNOTSUPP; return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack); } int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink_port_new_attrs new_attrs = {}; struct devlink *devlink = info->user_ptr[0]; struct devlink_port *devlink_port; struct sk_buff *msg; int err; if (!devlink->ops->port_new) return -EOPNOTSUPP; if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] || !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) { NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified"); return -EINVAL; } new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]); new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { /* Port index of the new port being created by driver. */ new_attrs.port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); new_attrs.port_index_valid = true; } if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) { new_attrs.controller = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]); new_attrs.controller_valid = true; } if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF && info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) { new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]); new_attrs.sfnum_valid = true; } err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port); if (err) return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { err = -ENOMEM; goto err_out_port_del; } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, info->snd_portid, info->snd_seq, 0, NULL); if (WARN_ON_ONCE(err)) goto err_out_msg_free; err = genlmsg_reply(msg, info); if (err) goto err_out_port_del; return 0; err_out_msg_free: nlmsg_free(msg); err_out_port_del: devlink_port->ops->port_del(devlink, devlink_port, NULL); return err; } int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (!devlink_port->ops->port_del) return -EOPNOTSUPP; return devlink_port->ops->port_del(devlink, devlink_port, extack); } static void devlink_port_type_warn(struct work_struct *work) { struct devlink_port *port = container_of(to_delayed_work(work), struct devlink_port, type_warn_dw); dev_warn(port->devlink->dev, "Type was not set for devlink port."); } static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours. 
*/ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; /* Schedule a work to WARN in case driver does not set port * type within timeout. */ schedule_delayed_work(&devlink_port->type_warn_dw, DEVLINK_PORT_TYPE_WARN_TIMEOUT); } static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) { if (!devlink_port_type_should_warn(devlink_port)) return; cancel_delayed_work_sync(&devlink_port->type_warn_dw); } /** * devlink_port_init() - Init devlink port * * @devlink: devlink * @devlink_port: devlink port * * Initialize essential stuff that is needed for functions * that may be called before devlink port registration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_init(struct devlink *devlink, struct devlink_port *devlink_port) { if (devlink_port->initialized) return; devlink_port->devlink = devlink; INIT_LIST_HEAD(&devlink_port->region_list); devlink_port->initialized = true; } EXPORT_SYMBOL_GPL(devlink_port_init); /** * devlink_port_fini() - Deinitialize devlink port * * @devlink_port: devlink port * * Deinitialize essential stuff that is in use for functions * that may be called after devlink port unregistration. * Call to this function is optional and not needed * in case the driver does not use such functions. */ void devlink_port_fini(struct devlink_port *devlink_port) { WARN_ON(!list_empty(&devlink_port->region_list)); } EXPORT_SYMBOL_GPL(devlink_port_fini); static const struct devlink_port_ops devlink_port_dummy_ops = {}; /** * devl_port_register_with_ops() - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. */ int devl_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port_init(devlink, devlink_port); devlink_port->registered = true; devlink_port->index = port_index; devlink_port->ops = ops ? ops : &devlink_port_dummy_ops; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); if (err) { devlink_port->registered = false; return err; } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_port_register_with_ops); /** * devlink_port_register_with_ops - Register devlink port * * @devlink: devlink * @devlink_port: devlink port * @port_index: driver-specific numerical identifier of the port * @ops: port ops * * Register devlink port with provided port index. User can use * any indexing, even hw-related one. 
devlink_port structure * is convenient to be embedded inside user driver private structure. * Note that the caller should take care of zeroing the devlink_port * structure. * * Context: Takes and releases devlink->lock <mutex>. */ int devlink_port_register_with_ops(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index, const struct devlink_port_ops *ops) { int err; devl_lock(devlink); err = devl_port_register_with_ops(devlink, devlink_port, port_index, ops); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_port_register_with_ops); /** * devl_port_unregister() - Unregister devlink port * * @devlink_port: devlink port */ void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); /** * devlink_port_unregister - Unregister devlink port * * @devlink_port: devlink port * * Context: Takes and releases devlink->lock <mutex>. */ void devlink_port_unregister(struct devlink_port *devlink_port) { struct devlink *devlink = devlink_port->devlink; devl_lock(devlink); devl_port_unregister(devlink_port); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_port_unregister); static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { const struct net_device_ops *ops = netdev->netdev_ops; /* If driver registers devlink port, it should set devlink port * attributes accordingly so the compat functions are called * and the original ops are not used. */ if (ops->ndo_get_phys_port_name) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_phys_port_name * returns -EOPNOTSUPP here in case it is defined. * Warn if not. */ char name[IFNAMSIZ]; int err; err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name)); WARN_ON(err != -EOPNOTSUPP); } if (ops->ndo_get_port_parent_id) { /* Some drivers use the same set of ndos for netdevs * that have devlink_port registered and also for * those who don't. Make sure that ndo_get_port_parent_id * returns -EOPNOTSUPP here in case it is defined. * Warn if not.
*/ struct netdev_phys_item_id ppid; int err; err = ops->ndo_get_port_parent_id(netdev, &ppid); WARN_ON(err != -EOPNOTSUPP); } } static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { struct net_device *netdev = type_dev; ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (type == DEVLINK_PORT_TYPE_NOTSET) { devlink_port_type_warn_schedule(devlink_port); } else { devlink_port_type_warn_cancel(devlink_port); if (type == DEVLINK_PORT_TYPE_ETH && netdev) devlink_port_type_netdev_checks(devlink_port, netdev); } spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; if (netdev) { ASSERT_RTNL(); devlink_port->type_eth.ifindex = netdev->ifindex; BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != sizeof(netdev->name)); strcpy(devlink_port->type_eth.ifname, netdev->name); } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; break; default: break; } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port * * If driver is calling this, most likely it is doing something wrong. */ void devlink_port_type_eth_set(struct devlink_port *devlink_port) { dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); /** * devlink_port_type_ib_set - Set port type to InfiniBand * * @devlink_port: devlink port * @ibdev: related IB device */ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); /** * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port * * If driver is calling this for clearing Ethernet type, most likely * it is doing something wrong. */ void devlink_port_type_clear(struct devlink_port *devlink_port) { if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); int devlink_port_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; if (!devlink_port) return NOTIFY_OK; devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: /* Set the type but not netdev pointer. It is going to be set * later on by NETDEV_REGISTER event. Happens once during * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); break; case NETDEV_REGISTER: case NETDEV_CHANGENAME: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. 
*/ __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: if (devlink_net(devlink) != dev_net(netdev)) return NOTIFY_OK; /* Clear netdev pointer, but not the type. This event happens * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens once during * netdevice unregister. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); break; } return NOTIFY_OK; } static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { devlink_port->switch_port = true; if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN)) attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN; } else { devlink_port->switch_port = false; } return 0; } /** * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) return; WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); /** * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); /** * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); /** * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct
devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) return; attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (!devlink_port) return; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index, u32 rel_index) { struct devlink_port *devlink_port; devlink_port = devlink_port_get_by_index(devlink, port_index); if (devlink_port && devlink_port->rel_index == rel_index) devlink_port->rel_index = 0; } /** * devl_port_fn_devlink_set - Attach peer devlink * instance to port function. * @devlink_port: devlink port * @fn_devlink: devlink instance to attach */ int devl_port_fn_devlink_set(struct devlink_port *devlink_port, struct devlink *fn_devlink) { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF || devlink_port->attrs.pci_sf.external)) return -EINVAL; return devlink_rel_nested_in_add(&devlink_port->rel_index, devlink_port->devlink->index, devlink_port->index, devlink_port_rel_notify_cb, devlink_port_rel_cleanup_cb, fn_devlink); } EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set); /** * devlink_port_linecard_set - Link port with a linecard * * @devlink_port: devlink port * @linecard: devlink linecard */ void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, char *name, size_t len) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; if (!devlink_port->attrs_set) return -EOPNOTSUPP; switch (attrs->flavour) { case DEVLINK_PORT_FLAVOUR_PHYSICAL: if (devlink_port->linecard) n = snprintf(name, len, "l%u", devlink_linecard_index(devlink_port->linecard)); if (n < len) n += snprintf(name + n, len - n, "p%u", attrs->phys.port_number); if (n < len && attrs->split) n += snprintf(name + n, len - n, "s%u", attrs->phys.split_subport_number); break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated, this * case should never happen.
*/ WARN_ON(1); return -EINVAL; case DEVLINK_PORT_FLAVOUR_PCI_PF: if (attrs->pci_pf.external) { n = snprintf(name, len, "c%u", attrs->pci_pf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%u", attrs->pci_pf.pf); break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (attrs->pci_vf.external) { n = snprintf(name, len, "c%u", attrs->pci_vf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%uvf%u", attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: if (attrs->pci_sf.external) { n = snprintf(name, len, "c%u", attrs->pci_sf.controller); if (n >= len) return -EINVAL; len -= n; name += n; } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; case DEVLINK_PORT_FLAVOUR_VIRTUAL: return -EOPNOTSUPP; } if (n >= len) return -EINVAL; return 0; } int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len) { struct devlink_port *devlink_port; /* RTNL mutex is held here which ensures that devlink_port * instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ ASSERT_RTNL(); devlink_port = dev->devlink_port; if (!devlink_port) return -EOPNOTSUPP; return __devlink_port_phys_port_name_get(devlink_port, name, len); } int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid) { struct devlink_port *devlink_port; /* Caller must hold RTNL mutex or reference to dev, which ensures that * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ devlink_port = dev->devlink_port; if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid)); return 0; }
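The file above is the core implementation; a driver consumes it through the registration helpers. Below is a minimal, hedged usage sketch (not part of port.c): a hypothetical driver, using illustrative "myd_*" names, sets PCI PF attributes and registers a port. It assumes the embedding structure was zero-allocated, as the kernel-doc above requires.

/*
 * Hedged usage sketch -- illustrative names, not part of port.c.
 */
#include <net/devlink.h>

struct myd_priv {
	struct devlink_port dl_port;	/* zeroed by kzalloc() of myd_priv */
};

static int myd_register_pf_port(struct devlink *devlink,
				struct myd_priv *priv,
				unsigned int port_index, u16 pfnum)
{
	/* Attributes must be set before registration; the attrs helpers
	 * assert ASSERT_DEVLINK_PORT_NOT_REGISTERED().
	 */
	devlink_port_attrs_pci_pf_set(&priv->dl_port, 0 /* controller */,
				      pfnum, false /* external */);

	/* NULL ops is fine: the core substitutes devlink_port_dummy_ops.
	 * This variant takes and releases devlink->lock internally; the
	 * matching teardown is devlink_port_unregister(&priv->dl_port).
	 */
	return devlink_port_register_with_ops(devlink, &priv->dl_port,
					      port_index, NULL);
}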
// SPDX-License-Identifier: GPL-2.0
/*
 * bsg.c - block layer implementation of the sg v4 interface
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/cdev.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/idr.h>
#include <linux/bsg.h>
#include <linux/slab.h>

#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/sg.h>

#define BSG_DESCRIPTION	"Block layer SCSI generic (bsg) driver"
#define BSG_VERSION	"0.4"

struct bsg_device {
	struct request_queue *queue;
	struct device device;
	struct cdev cdev;
	int max_queue;
	unsigned int timeout;
	unsigned int reserved_size;
	bsg_sg_io_fn *sg_io_fn;
};

static inline struct bsg_device *to_bsg_device(struct inode *inode)
{
	return container_of(inode->i_cdev, struct bsg_device, cdev);
}

#define BSG_DEFAULT_CMDS	64
#define BSG_MAX_DEVS		(1 << MINORBITS)

static DEFINE_IDA(bsg_minor_ida);
static const struct class bsg_class;
static int bsg_major;

static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr)
{
	unsigned int timeout = BLK_DEFAULT_SG_TIMEOUT;

	if (hdr->timeout)
		timeout = msecs_to_jiffies(hdr->timeout);
	else if (bd->timeout)
		timeout = bd->timeout;

	return max_t(unsigned int, timeout, BLK_MIN_SG_TIMEOUT);
}

static int bsg_sg_io(struct bsg_device *bd, bool open_for_write,
		     void __user *uarg)
{
	struct sg_io_v4 hdr;
	int ret;

	if (copy_from_user(&hdr, uarg, sizeof(hdr)))
		return -EFAULT;
	if (hdr.guard != 'Q')
		return -EINVAL;
	ret = bd->sg_io_fn(bd->queue, &hdr, open_for_write,
			   bsg_timeout(bd, &hdr));
	if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr)))
		return -EFAULT;
	return ret;
}

static int bsg_open(struct inode *inode, struct file *file)
{
	if (!blk_get_queue(to_bsg_device(inode)->queue))
		return -ENXIO;
	return 0;
}

static int bsg_release(struct inode *inode, struct file *file)
{
	blk_put_queue(to_bsg_device(inode)->queue);
	return 0;
}

static int bsg_get_command_q(struct bsg_device *bd, int __user *uarg)
{
	return put_user(READ_ONCE(bd->max_queue), uarg);
}

static int bsg_set_command_q(struct bsg_device *bd, int __user *uarg)
{
	int max_queue;

	if (get_user(max_queue, uarg))
		return -EFAULT;
	if (max_queue < 1)
		return -EINVAL;
	WRITE_ONCE(bd->max_queue, max_queue);
	return 0;
}

static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct bsg_device *bd = to_bsg_device(file_inode(file));
	struct request_queue *q = bd->queue;
	void __user *uarg = (void __user *) arg;
	int __user *intp = uarg;
	int val;

	switch (cmd) {
	/*
	 * Our own ioctls
	 */
	case SG_GET_COMMAND_Q:
		return bsg_get_command_q(bd, uarg);
	case SG_SET_COMMAND_Q:
		return bsg_set_command_q(bd, uarg);

	/*
	 * SCSI/sg ioctls
	 */
	case SG_GET_VERSION_NUM:
		return put_user(30527, intp);
	case SCSI_IOCTL_GET_IDLUN:
		return put_user(0, intp);
	case SCSI_IOCTL_GET_BUS_NUMBER:
		return put_user(0, intp);
	case SG_SET_TIMEOUT:
		if (get_user(val, intp))
			return -EFAULT;
		bd->timeout = clock_t_to_jiffies(val);
		return 0;
	case SG_GET_TIMEOUT:
		return jiffies_to_clock_t(bd->timeout);
	case SG_GET_RESERVED_SIZE:
		return put_user(min(bd->reserved_size, queue_max_bytes(q)),
				intp);
	case SG_SET_RESERVED_SIZE:
		if (get_user(val, intp))
			return -EFAULT;
		if (val < 0)
			return -EINVAL;
		bd->reserved_size =
			min_t(unsigned int, val, queue_max_bytes(q));
		return 0;
	case SG_EMULATED_HOST:
		return put_user(1, intp);
	case SG_IO:
		return bsg_sg_io(bd, file->f_mode & FMODE_WRITE, uarg);
	case SCSI_IOCTL_SEND_COMMAND:
		pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n",
				    current->comm);
		return -EINVAL;
	default:
		return -ENOTTY;
	}
}

static const struct file_operations bsg_fops = {
	.open		= bsg_open,
	.release	= bsg_release,
	.unlocked_ioctl	= bsg_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.owner		= THIS_MODULE,
	.llseek		= default_llseek,
};

static void bsg_device_release(struct device *dev)
{
	struct bsg_device *bd = container_of(dev, struct bsg_device, device);

	ida_free(&bsg_minor_ida, MINOR(bd->device.devt));
	kfree(bd);
}

void bsg_unregister_queue(struct bsg_device *bd)
{
	struct gendisk *disk = bd->queue->disk;

	if (disk && disk->queue_kobj.sd)
		sysfs_remove_link(&disk->queue_kobj, "bsg");
	cdev_device_del(&bd->cdev, &bd->device);
	put_device(&bd->device);
}
EXPORT_SYMBOL_GPL(bsg_unregister_queue);

struct bsg_device *bsg_register_queue(struct request_queue *q,
		struct device *parent, const char *name,
		bsg_sg_io_fn *sg_io_fn)
{
	struct bsg_device *bd;
	int ret;

	bd = kzalloc(sizeof(*bd), GFP_KERNEL);
	if (!bd)
		return ERR_PTR(-ENOMEM);
	bd->max_queue = BSG_DEFAULT_CMDS;
	bd->reserved_size = INT_MAX;
	bd->queue = q;
	bd->sg_io_fn = sg_io_fn;

	ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL);
	if (ret < 0) {
		if (ret == -ENOSPC)
			dev_err(parent, "bsg: too many bsg devices\n");
		kfree(bd);
		return ERR_PTR(ret);
	}
	bd->device.devt = MKDEV(bsg_major, ret);
	bd->device.class = &bsg_class;
	bd->device.parent = parent;
	bd->device.release = bsg_device_release;
	dev_set_name(&bd->device, "%s", name);
	device_initialize(&bd->device);

	cdev_init(&bd->cdev, &bsg_fops);
	bd->cdev.owner = THIS_MODULE;
	ret = cdev_device_add(&bd->cdev, &bd->device);
	if (ret)
		goto out_put_device;

	if (q->disk && q->disk->queue_kobj.sd) {
		ret = sysfs_create_link(&q->disk->queue_kobj,
					&bd->device.kobj, "bsg");
		if (ret)
			goto out_device_del;
	}

	return bd;

out_device_del:
	cdev_device_del(&bd->cdev, &bd->device);
out_put_device:
	put_device(&bd->device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(bsg_register_queue);

static char *bsg_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
}

static const struct class bsg_class = {
	.name		= "bsg",
	.devnode	= bsg_devnode,
};

static int __init bsg_init(void)
{
	dev_t devid;
	int ret;

	ret = class_register(&bsg_class);
	if (ret)
		return ret;

	ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
	if (ret)
		goto destroy_bsg_class;
	bsg_major = MAJOR(devid);

	printk(KERN_INFO BSG_DESCRIPTION " version " BSG_VERSION
	       " loaded (major %d)\n", bsg_major);
	return 0;

destroy_bsg_class:
	class_unregister(&bsg_class);
	return ret;
}

MODULE_AUTHOR("Jens Axboe");
MODULE_DESCRIPTION(BSG_DESCRIPTION);
MODULE_LICENSE("GPL");
device_initcall(bsg_init);
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon
Graphics, Inc. * */ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> struct kernel_clone_args; #ifdef CONFIG_CGROUPS /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void 
cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. 
* } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranteed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same way to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any.
*/ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } extern struct mutex cgroup_mutex; static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). 
*/ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. 
*/ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. 
*/ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. 
*/ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } #endif /* !CONFIG_CGROUPS */ static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); #endif /* _LINUX_CGROUP_H */ |
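/*
 * Illustrative sketch (editor's addition, not part of the header above):
 * a hypothetical in-kernel helper showing the intended caller pattern for
 * the accessors declared in this header. task_get_css() retries until
 * css_tryget() succeeds, so the returned css is guaranteed valid (though
 * it may already be offline), and must be balanced with css_put(). The
 * function name and pr_info() usage are our own; this assumes a kernel
 * build with CONFIG_CGROUPS=y and the cpu controller enabled.
 */
#include <linux/cgroup.h>

static void example_log_task_cpu_cgroup(struct task_struct *task)
{
	struct cgroup_subsys_state *css;
	char path[256];

	/* Pin the cpu controller's css for @task; cannot fail. */
	css = task_get_css(task, cpu_cgrp_id);

	if (cgroup_path(css->cgroup, path, sizeof(path)) >= 0)
		pr_info("%s[%d] in cpu cgroup %s (id %llu)\n",
			task->comm, task->pid, path,
			cgroup_id(css->cgroup));

	css_put(css);	/* drop the reference taken by task_get_css() */
}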
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Global definitions for the Ethernet IEEE 802.3 interface.
 *
 * Version:	@(#)if_ether.h	1.0.1a	02/08/94
 *
 * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
 */
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H

#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>

static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb_mac_header(skb);
}

/* Prefer this version in TX path, instead of
 * skb_reset_mac_header() + eth_hdr()
 */
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb->data;
}

static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb_inner_mac_header(skb);
}

int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);

extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);

#endif	/* _LINUX_IF_ETHER_H */
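/*
 * Illustrative sketch (editor's addition): decoding the fixed 14-byte
 * Ethernet header from a raw frame buffer in userspace, using the same
 * struct ethhdr that the uapi side of linux/if_ether.h exports. The
 * sample frame bytes are made up for demonstration; this is a standalone
 * program, not kernel code.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>		/* ntohs() */
#include <linux/if_ether.h>	/* struct ethhdr, ETH_HLEN, ETH_P_IP */

int main(void)
{
	unsigned char frame[ETH_HLEN] = {
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01,	/* destination MAC */
		0x02, 0x00, 0x00, 0x00, 0x00, 0x02,	/* source MAC */
		0x08, 0x00,				/* ethertype ETH_P_IP */
	};
	struct ethhdr eth;

	memcpy(&eth, frame, sizeof(eth));
	printf("dst %02x:%02x:%02x:%02x:%02x:%02x proto 0x%04x%s\n",
	       eth.h_dest[0], eth.h_dest[1], eth.h_dest[2],
	       eth.h_dest[3], eth.h_dest[4], eth.h_dest[5],
	       ntohs(eth.h_proto),
	       ntohs(eth.h_proto) == ETH_P_IP ? " (IPv4)" : "");
	return 0;
}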
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2019 ARM Ltd.
 *
 * Generic implementation of update_vsyscall and update_vsyscall_tz.
 *
 * Based on the x86 specific implementation.
 */

#include <linux/hrtimer.h>
#include <linux/timekeeper_internal.h>
#include <vdso/datapage.h>
#include <vdso/helpers.h>
#include <vdso/vsyscall.h>

#include "timekeeping_internal.h"

static inline void update_vdso_data(struct vdso_data *vdata,
				    struct timekeeper *tk)
{
	struct vdso_timestamp *vdso_ts;
	u64 nsec, sec;

	vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
	vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
	vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
	vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
	vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
	vdata[CS_RAW].mask = tk->tkr_raw.mask;
	vdata[CS_RAW].mult = tk->tkr_raw.mult;
	vdata[CS_RAW].shift = tk->tkr_raw.shift;

	/* CLOCK_MONOTONIC */
	vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
	vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;

	nsec = tk->tkr_mono.xtime_nsec;
	nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
	while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
		nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
		vdso_ts->sec++;
	}
	vdso_ts->nsec = nsec;

	/* Copy MONOTONIC time for BOOTTIME */
	sec = vdso_ts->sec;
	/* Add the boot offset */
	sec += tk->monotonic_to_boot.tv_sec;
	nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;

	/* CLOCK_BOOTTIME */
	vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
	vdso_ts->sec = sec;

	while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
		nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
		vdso_ts->sec++;
	}
	vdso_ts->nsec = nsec;

	/* CLOCK_MONOTONIC_RAW */
	vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
	vdso_ts->sec = tk->raw_sec;
	vdso_ts->nsec = tk->tkr_raw.xtime_nsec;

	/* CLOCK_TAI */
	vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
	vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
	vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
}

void update_vsyscall(struct timekeeper *tk)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();
	struct vdso_timestamp *vdso_ts;
	s32 clock_mode;
	u64 nsec;

	/* copy vsyscall data */
	vdso_write_begin(vdata);

	clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
	vdata[CS_HRES_COARSE].clock_mode = clock_mode;
	vdata[CS_RAW].clock_mode = clock_mode;

	/* CLOCK_REALTIME also required for time() */
	vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
	vdso_ts->sec = tk->xtime_sec;
	vdso_ts->nsec = tk->tkr_mono.xtime_nsec;

	/* CLOCK_REALTIME_COARSE */
	vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
	vdso_ts->sec = tk->xtime_sec;
	vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;

	/* CLOCK_MONOTONIC_COARSE */
	vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
	vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
	nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
	nsec =
		nsec + tk->wall_to_monotonic.tv_nsec;
	vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);

	/*
	 * Read without the seqlock held by clock_getres().
	 * Note: No need to have a second copy.
	 */
	WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);

	/*
	 * If the current clocksource is not VDSO capable, then spare the
	 * update of the high resolution parts.
	 */
	if (clock_mode != VDSO_CLOCKMODE_NONE)
		update_vdso_data(vdata, tk);

	__arch_update_vsyscall(vdata, tk);

	vdso_write_end(vdata);

	__arch_sync_vdso_data(vdata);
}

void update_vsyscall_tz(void)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();

	vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
	vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;

	__arch_sync_vdso_data(vdata);
}

/**
 * vdso_update_begin - Start of a VDSO update section
 *
 * Allows architecture code to safely update the architecture specific VDSO
 * data. Disables interrupts, acquires timekeeper lock to serialize against
 * concurrent updates from timekeeping and invalidates the VDSO data
 * sequence counter to prevent concurrent readers from accessing
 * inconsistent data.
 *
 * Returns: Saved interrupt flags which need to be handed in to
 * vdso_update_end().
 */
unsigned long vdso_update_begin(void)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	vdso_write_begin(vdata);
	return flags;
}

/**
 * vdso_update_end - End of a VDSO update section
 * @flags: Interrupt flags as returned from vdso_update_begin()
 *
 * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data
 * synchronization if the architecture requires it, drops timekeeper lock
 * and restores interrupt flags.
 */
void vdso_update_end(unsigned long flags)
{
	struct vdso_data *vdata = __arch_get_k_vdso_data();

	vdso_write_end(vdata);
	__arch_sync_vdso_data(vdata);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
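/*
 * Illustrative sketch (editor's addition): how architecture code is
 * expected to bracket updates of its own VDSO fields with the two helpers
 * above. The function name is hypothetical; only vdso_update_begin() and
 * vdso_update_end() come from this file. While the section is open the
 * vdso sequence count is odd, so concurrent userspace readers spin
 * instead of observing a half-written data page.
 */
static void example_arch_update_vdso(void)
{
	unsigned long flags;

	flags = vdso_update_begin();	/* irqs off, timekeeper_lock held */
	/* ... write architecture-specific vdso_data fields here ... */
	vdso_update_end(flags);		/* data marked consistent again */
}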
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/bad_inode.c
 *
 *  Copyright (C) 1997, Stephen Tweedie
 *
 *  Provide stub functions for unreadable inodes
 *
 *  Fabian Frederick : August 2003 - All file operations assigned to EIO
 */

#include <linux/fs.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/time.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/fiemap.h>

static int bad_file_open(struct inode *inode, struct file *filp)
{
	return -EIO;
}

static const struct file_operations bad_file_ops = {
	.open = bad_file_open,
};

static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir,
			    struct dentry *dentry, umode_t mode, bool excl)
{
	return -EIO;
}

static struct dentry *bad_inode_lookup(struct inode *dir,
			struct dentry *dentry, unsigned int flags)
{
	return ERR_PTR(-EIO);
}

static int bad_inode_link(struct dentry *old_dentry, struct inode *dir,
			  struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_unlink(struct inode *dir, struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir,
			     struct dentry *dentry, const char *symname)
{
	return -EIO;
}

static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
			   struct dentry *dentry, umode_t mode)
{
	return -EIO;
}

static int bad_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
	return -EIO;
}

static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir,
			   struct dentry *dentry, umode_t mode, dev_t rdev)
{
	return -EIO;
}

static int bad_inode_rename2(struct mnt_idmap *idmap,
			     struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry,
			     unsigned int flags)
{
	return -EIO;
}

static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
			      int buflen)
{
	return -EIO;
}

static int bad_inode_permission(struct mnt_idmap *idmap,
				struct inode *inode, int mask)
{
	return -EIO;
}

static int bad_inode_getattr(struct mnt_idmap *idmap,
			     const struct path *path, struct kstat *stat,
			     u32 request_mask, unsigned int query_flags)
{
	return -EIO;
}

static int bad_inode_setattr(struct mnt_idmap *idmap,
			     struct dentry *direntry, struct iattr *attrs)
{
	return -EIO;
}

static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
			size_t buffer_size)
{
	return -EIO;
}

static const char *bad_inode_get_link(struct dentry *dentry,
				      struct inode *inode,
				      struct delayed_call *done)
{
	return ERR_PTR(-EIO);
}

static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type,
					   bool rcu)
{
	return ERR_PTR(-EIO);
}

static int bad_inode_fiemap(struct inode *inode,
			    struct fiemap_extent_info *fieinfo, u64 start,
			    u64 len)
{
	return
		-EIO;
}

static int bad_inode_update_time(struct inode *inode, int flags)
{
	return -EIO;
}

static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
				 struct file *file, unsigned int open_flag,
				 umode_t create_mode)
{
	return -EIO;
}

static int bad_inode_tmpfile(struct mnt_idmap *idmap,
			     struct inode *inode, struct file *file,
			     umode_t mode)
{
	return -EIO;
}

static int bad_inode_set_acl(struct mnt_idmap *idmap,
			     struct dentry *dentry, struct posix_acl *acl,
			     int type)
{
	return -EIO;
}

static const struct inode_operations bad_inode_ops = {
	.create		= bad_inode_create,
	.lookup		= bad_inode_lookup,
	.link		= bad_inode_link,
	.unlink		= bad_inode_unlink,
	.symlink	= bad_inode_symlink,
	.mkdir		= bad_inode_mkdir,
	.rmdir		= bad_inode_rmdir,
	.mknod		= bad_inode_mknod,
	.rename		= bad_inode_rename2,
	.readlink	= bad_inode_readlink,
	.permission	= bad_inode_permission,
	.getattr	= bad_inode_getattr,
	.setattr	= bad_inode_setattr,
	.listxattr	= bad_inode_listxattr,
	.get_link	= bad_inode_get_link,
	.get_inode_acl	= bad_inode_get_acl,
	.fiemap		= bad_inode_fiemap,
	.update_time	= bad_inode_update_time,
	.atomic_open	= bad_inode_atomic_open,
	.tmpfile	= bad_inode_tmpfile,
	.set_acl	= bad_inode_set_acl,
};

/*
 * When a filesystem is unable to read an inode due to an I/O error in
 * its read_inode() function, it can call make_bad_inode() to return a
 * set of stubs which will return EIO errors as required.
 *
 * We only need to do limited initialisation: all other fields are
 * preinitialised to zero automatically.
 */

/**
 * make_bad_inode - mark an inode bad due to an I/O error
 * @inode: Inode to mark bad
 *
 * When an inode cannot be read due to a media or remote network
 * failure this function makes the inode "bad" and causes I/O operations
 * on it to fail from this point on.
 */
void make_bad_inode(struct inode *inode)
{
	remove_inode_hash(inode);

	inode->i_mode = S_IFREG;
	simple_inode_init_ts(inode);
	inode->i_op = &bad_inode_ops;
	inode->i_opflags &= ~IOP_XATTR;
	inode->i_fop = &bad_file_ops;
}
EXPORT_SYMBOL(make_bad_inode);

/*
 * This tests whether an inode has been flagged as bad. The test uses
 * &bad_inode_ops to cover the case of invalidated inodes as well as
 * those created by make_bad_inode() above.
 */

/**
 * is_bad_inode - is an inode errored
 * @inode: inode to test
 *
 * Returns true if the inode in question has been marked as bad.
 */
bool is_bad_inode(struct inode *inode)
{
	return (inode->i_op == &bad_inode_ops);
}
EXPORT_SYMBOL(is_bad_inode);

/**
 * iget_failed - Mark an under-construction inode as dead and release it
 * @inode: The inode to discard
 *
 * Mark an under-construction inode as dead and release it.
 */
void iget_failed(struct inode *inode)
{
	make_bad_inode(inode);
	unlock_new_inode(inode);
	iput(inode);
}
EXPORT_SYMBOL(iget_failed);
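/*
 * Illustrative sketch (editor's addition): the typical caller pattern for
 * the helpers above. "examplefs" and its read function are hypothetical.
 * A filesystem's iget path marks an under-construction inode dead with
 * iget_failed() when the on-disk read fails; later lookups can detect
 * such inodes with is_bad_inode().
 */
#include <linux/fs.h>

static struct inode *examplefs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;
	int err;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* already in the inode cache */

	err = examplefs_read_inode_from_disk(inode);	/* hypothetical */
	if (err) {
		iget_failed(inode);	/* make_bad_inode() + unlock + iput */
		return ERR_PTR(err);
	}

	unlock_new_inode(inode);
	return inode;
}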
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	RAW sockets for IPv6
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Adapted from linux/net/ipv4/raw.c
 *
 *	Fixes:
 *	Hideaki YOSHIFUJI	:	sin6_scope_id support
 *	YOSHIFUJI,H.@USAGI	:	raw checksum (RFC2292(bis) compliance)
 *	Kazunori MIYAZAWA @USAGI:	change process style to use ip6_append_data
 */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/slab.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/skbuff.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <net/transp_v6.h>
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/mip6.h>
#endif
#include <linux/mroute6.h>

#include <net/raw.h>
#include <net/rawv6.h>
#include <net/xfrm.h>

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>

#define	ICMPV6_HDRLEN	4	/* ICMPv6 header, RFC 4443 Section 2.1 */

struct raw_hashinfo
raw_v6_hashinfo; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif, int sdif) { if (inet_sk(sk)->inet_num != num || !net_eq(sock_net(sk), net) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) || (ipv6_addr_is_multicast(loc_addr) && inet6_mc_check(sk, loc_addr, rmt_addr))) return true; return false; } EXPORT_SYMBOL_GPL(raw_v6_match); /* * 0 - deliver * 1 - block */ static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmp6hdr _hdr; const struct icmp6hdr *hdr; /* We require only the four bytes of the ICMPv6 header, not any * additional bytes of message body in "struct icmp6hdr". */ hdr = skb_header_pointer(skb, skb_transport_offset(skb), ICMPV6_HDRLEN, &_hdr); if (hdr) { const __u32 *data = &raw6_sk(sk)->filter.data[0]; unsigned int type = hdr->icmp6_type; return (data[type >> 5] & (1U << (type & 31))) != 0; } return 1; } #if IS_ENABLED(CONFIG_IPV6_MIP6) typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); static mh_filter_t __rcu *mh_filter __read_mostly; int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); int rawv6_mh_filter_unregister(mh_filter_t filter) { RCU_INIT_POINTER(mh_filter, NULL); synchronize_rcu(); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_unregister); #endif /* * demultiplex raw sockets. * (should consider queueing the skb in the sock receive_queue * without calling rawv6.c) * * Caller owns SKB so we must make clones. */ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); const struct in6_addr *saddr; const struct in6_addr *daddr; struct hlist_head *hlist; struct sock *sk; bool delivered = false; __u8 hash; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, inet6_iif(skb), inet6_sdif(skb))) continue; delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); break; #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: { /* XXX: To validate MH only once for each packet, * this is placed here. It should be after checking * xfrm policy, however it doesn't. The checking xfrm * policy is placed in rawv6_rcv() because it is * required for each socket. */ mh_filter_t *filter; filter = rcu_dereference(mh_filter); filtered = filter ? (*filter)(sk, skb) : 0; break; } #endif default: filtered = 0; break; } if (filtered < 0) break; if (filtered == 0) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) rawv6_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { return ipv6_raw_deliver(skb, nexthdr); } /* This cleans up af_inet6 a bit. 
-DaveM */ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; __be32 v4addr = 0; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (sk->sk_state != TCP_CLOSE) goto out; rcu_read_lock(); /* Check if the address belongs to the host. */ if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another * one is supplied by user. */ sk->sk_bound_dev_if = addr->sin6_scope_id; } /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) goto out_unlock; } if (sk->sk_bound_dev_if) { err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), sk->sk_bound_dev_if); if (!dev) goto out_unlock; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST) && !ipv6_can_nonlocal_bind(sock_net(sk), inet)) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { goto out_unlock; } } } inet->inet_rcv_saddr = inet->inet_saddr = v4addr; sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; err = 0; out_unlock: rcu_read_unlock(); out: release_sock(sk); return err; } static void rawv6_err(struct sock *sk, struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; /* Report error on raw socket, if: 1. User requested recverr. 2. Socket is connected (otherwise the error indication is useless without recverr and error is hard. 
*/ if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; harderr = icmpv6_err_convert(type, code, &err); if (type == ICMPV6_PKT_TOOBIG) { ip6_sk_update_pmtu(skb, sk, info); harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO); } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); return; } if (recverr) { u8 *payload = skb->data; if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); struct hlist_head *hlist; struct sock *sk; int hash; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) continue; rawv6_err(sk, skb, NULL, type, code, inner_offset, info); } rcu_read_unlock(); } static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { kfree_skb_reason(skb, reason); return NET_RX_DROP; } return 0; } /* * This is next to useless... * if we demultiplex in network layer we don't need the extra call * just to queue the skb... * maybe we could have the network decide upon a hint if it * should call raw_rcv for demultiplexing */ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; if (skb->ip_summed == CHECKSUM_COMPLETE) { skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } if (!skb_csum_unnecessary(skb)) skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, 0)); if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } rawv6_rcv_skb(sk, skb); return 0; } /* * This should be easy, if there is something there * we return it, otherwise we block. 
*/ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); if (np->rxpmtu && np->rxopt.bits.rxpmtu) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } if (skb_csum_unnecessary(skb)) { err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, 0, msg); if (err == -EINVAL) goto csum_copy_err; } if (err) goto out_free; /* Copy the address. */ if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); *addr_len = sizeof(*sin6); } sock_recv_cmsgs(msg, sk, skb); if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); err = copied; if (flags & MSG_TRUNC) err = skb->len; out_free: skb_free_datagram(sk, skb); out: return err; csum_copy_err: skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; int len; int total_len; __wsum tmp_csum; __sum16 csum; if (!rp->checksum) goto send; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; opt = inet6_sk(sk)->cork.opt; total_len -= opt ? opt->opt_flen : 0; if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); goto out; } /* should be check HW csum miyazawa */ if (skb_queue_len(&sk->sk_write_queue) == 1) { /* * Only one fragment on the socket. 
*/ tmp_csum = skb->csum; } else { struct sk_buff *csum_skb = NULL; tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); if (csum_skb) continue; len = skb->len - skb_transport_offset(skb); if (offset >= len) { offset -= len; continue; } csum_skb = skb; } skb = csum_skb; } offset += skb_transport_offset(skb); err = skb_copy_bits(skb, offset, &csum, 2); if (err < 0) { ip6_flush_pending_frames(sk); goto out; } /* in case cksum was not initialized */ if (unlikely(csum)) tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, total_len, fl6->flowi6_proto, tmp_csum); if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); out: return err; } static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags, const struct sockcm_cookie *sockc) { struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; struct rt6_info *rt = (struct rt6_info *)*dstp; int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; if (length > rt->dst.dev->mtu) { ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct ipv6hdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; skb->tstamp = sockc->transmit_time; skb_put(skb, length); skb_reset_network_header(skb); iph = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc->tsflags); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); if (err) { err = -EFAULT; kfree_skb(skb); goto error; } skb_dst_set(skb, &rt->dst); *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev * in the error path. Since skb has been freed, the dst could * have been queued for deletion. 
*/ rcu_read_lock(); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) { IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); goto error_check; } rcu_read_unlock(); out: return 0; error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); error_check: if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) err = 0; return err; } struct raw6_frag_vec { struct msghdr *msg; int hlen; char c[4]; }; static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6) { int err = 0; switch (fl6->flowi6_proto) { case IPPROTO_ICMPV6: rfv->hlen = 2; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) { fl6->fl6_icmp_type = rfv->c[0]; fl6->fl6_icmp_code = rfv->c[1]; } break; case IPPROTO_MH: rfv->hlen = 4; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) fl6->fl6_mh_type = rfv->c[2]; } return err; } static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw6_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int hdrincl; u16 proto; int err; /* Rough check on arithmetic overflow, better check is made in ip6_append_data(). */ if (len > INT_MAX) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hdrincl = inet_test_bit(HDRINCL, sk); /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); if (!proto) proto = inet->inet_num; else if (proto != inet->inet_num && inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) return -EINVAL; daddr = &sin6->sin6_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* * Otherwise it will be difficult to maintain * sk->sk_dst_cache. 
*/ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; proto = inet->inet_num; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = rawv6_probe_proto_opt(&rfv, &fl6); if (err) goto out; } if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; if (ipc6.tclass < 0) ipc6.tclass = np->tclass; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (ipc6.dontfrag < 0) ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, (struct rt6_info *)dst, msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = rawv6_push_pending_frames(sk, &fl6, rp); release_sock(sk); } done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? 
err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int len; switch (optname) { case ICMPV6_FILTER: if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; if (len > sizeof(struct icmp6_filter)) len = sizeof(struct icmp6_filter); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6) { /* * RFC3542 tells that IPV6_CHECKSUM socket * option in the IPPROTO_IPV6 level is not * allowed on ICMPv6 sockets. * If you want to set it, use IPPROTO_RAW * level IPV6_CHECKSUM socket option * (Linux extension). */ return -EINVAL; } /* You may get strange result with a positive odd offset; RFC2292bis agrees with me. */ if (val > 0 && (val&1)) return -EINVAL; if (val < 0) { rp->checksum = 0; } else { rp->checksum = 1; rp->offset = val; } return 0; default: return -ENOPROTOOPT; } } static int rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct raw6_sock *rp = raw6_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; switch (optname) { case IPV6_HDRINCL: val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* * We allow getsockopt() for IPPROTO_IPV6-level * IPV6_CHECKSUM socket option on ICMPv6 sockets * since RFC3542 is silent about it. 
*/ if (rp->checksum == 0) val = -1; else val = rp->offset; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, level, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif static void rawv6_close(struct sock *sk, long timeout) { if (inet_sk(sk)->inet_num == IPPROTO_RAW) ip6_ra_control(sk, -1); ip6mr_sk_done(sk); sk_common_release(sk); } static void raw6_destroy(struct sock *sk) { lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); } static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; rp->offset = 2; break; case IPPROTO_MH: rp->checksum = 1; rp->offset = 4; break; default: break; } return 0; } struct proto rawv6_prot = { .name = "RAWv6", .owner = THIS_MODULE, .close = rawv6_close, .destroy = raw6_destroy, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, .setsockopt = rawv6_setsockopt, .getsockopt = rawv6_getsockopt, .sendmsg = rawv6_sendmsg, .recvmsg = rawv6_recvmsg, .bind = rawv6_bind, .backlog_rcv = rawv6_rcv_skb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_rawv6_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static int raw6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { struct sock *sp = v; __u16 srcp = inet_sk(sp)->inet_num; ip6_dgram_sock_seq_show(seq, v, srcp, 0, raw_seq_private(seq)->bucket); } return 0; } static const struct seq_operations raw6_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw6_seq_show, }; static int __net_init raw6_init_net(struct net *net) { if (!proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops, sizeof(struct raw_iter_state), &raw_v6_hashinfo)) return -ENOMEM; return 0; } static void __net_exit raw6_exit_net(struct net *net) { 
remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, }; int __init raw6_proc_init(void) { return register_pernet_subsys(&raw6_net_ops); } void raw6_proc_exit(void) { unregister_pernet_subsys(&raw6_net_ops); } #endif /* CONFIG_PROC_FS */ /* Same as inet6_dgram_ops, sans udp_poll. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw rawv6_protosw = { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; int __init rawv6_init(void) { return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) { inet6_unregister_protosw(&rawv6_protosw); } |
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <net/udp_tunnel.h>
#include <net/sch_generic.h>
#include <linux/netfilter.h>
#include <rdma/ib_addr.h>

#include "rxe.h"
#include "rxe_net.h"
#include "rxe_loc.h"

static struct rxe_recv_sockets recv_sockets;

static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
					 struct net_device *ndev,
					 struct in_addr *saddr,
					 struct in_addr *daddr)
{
	struct rtable *rt;
	struct flowi4 fl = { { 0 } };

	memset(&fl, 0, sizeof(fl));
	fl.flowi4_oif = ndev->ifindex;
	memcpy(&fl.saddr, saddr, sizeof(*saddr));
	memcpy(&fl.daddr, daddr, sizeof(*daddr));
	fl.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_key(&init_net, &fl);
	if (IS_ERR(rt)) {
		rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr);
		return NULL;
	}

	return &rt->dst;
}

#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
					 struct net_device *ndev,
					 struct in6_addr *saddr,
					 struct in6_addr *daddr)
{
	struct dst_entry *ndst;
	struct flowi6 fl6 = { { 0 } };

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = ndev->ifindex;
	memcpy(&fl6.saddr, saddr, sizeof(*saddr));
	memcpy(&fl6.daddr, daddr, sizeof(*daddr));
	fl6.flowi6_proto = IPPROTO_UDP;

	ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
					       recv_sockets.sk6->sk, &fl6,
					       NULL);
	if (IS_ERR(ndst)) {
		rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
		return NULL;
	}

	if (unlikely(ndst->error)) {
		rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
		goto put;
	}

	return ndst;
put:
	dst_release(ndst);
	return NULL;
}
#else
static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
					 struct net_device *ndev,
					 struct in6_addr *saddr,
					 struct in6_addr *daddr)
{
	return NULL;
}
#endif

static struct dst_entry *rxe_find_route(struct net_device *ndev,
					struct rxe_qp *qp,
					struct rxe_av *av)
{
	struct dst_entry *dst = NULL;

	if (qp_type(qp) == IB_QPT_RC)
		dst = sk_dst_get(qp->sk->sk);

	if (!dst || !dst_check(dst, qp->dst_cookie)) {
		if (dst)
			dst_release(dst);

		if (av->network_type == RXE_NETWORK_TYPE_IPV4) {
			struct in_addr *saddr;
			struct in_addr *daddr;

			saddr = &av->sgid_addr._sockaddr_in.sin_addr;
			daddr = &av->dgid_addr._sockaddr_in.sin_addr;
			dst = rxe_find_route4(qp, ndev, saddr, daddr);
		} else if (av->network_type == RXE_NETWORK_TYPE_IPV6) {
			struct in6_addr *saddr6;
			struct in6_addr *daddr6;

			saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr;
			daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr;
			dst = rxe_find_route6(qp, ndev, saddr6, daddr6);
#if IS_ENABLED(CONFIG_IPV6)
			if (dst)
				qp->dst_cookie =
					rt6_get_cookie((struct rt6_info *)dst);
#endif
		}

		if (dst && (qp_type(qp) == IB_QPT_RC)) {
			dst_hold(dst);
			sk_dst_set(qp->sk->sk, dst);
		}
	}
	return dst;
}

static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct udphdr *udph;
	struct rxe_dev *rxe;
	struct net_device *ndev = skb->dev;
	struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);

	/* takes a reference on rxe->ib_dev
	 * drop when skb is freed
	 */
	rxe = rxe_get_dev_from_net(ndev);
	if (!rxe && is_vlan_dev(ndev))
		rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
	if (!rxe)
		goto drop;

	if (skb_linearize(skb)) {
		ib_device_put(&rxe->ib_dev);
		goto drop;
	}

	udph = udp_hdr(skb);
	pkt->rxe = rxe;
	pkt->port_num = 1;
	pkt->hdr = (u8 *)(udph + 1);
	pkt->mask = RXE_GRH_MASK;
	pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);

	/* remove udp header */
	skb_pull(skb, sizeof(struct udphdr));

	rxe_rcv(skb);

	return 0;
drop:
	kfree_skb(skb);

	return 0;
}

static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
					   bool ipv6)
{
	int err;
	struct socket *sock;
	struct udp_port_cfg udp_cfg = { };
	struct udp_tunnel_sock_cfg tnl_cfg = { };

	if (ipv6) {
		udp_cfg.family = AF_INET6;
		udp_cfg.ipv6_v6only = 1;
	} else {
		udp_cfg.family = AF_INET;
	}

	udp_cfg.local_udp_port = port;

	/* Create UDP socket */
	err = udp_sock_create(net, &udp_cfg, &sock);
	if (err < 0)
		return ERR_PTR(err);

	tnl_cfg.encap_type = 1;
	tnl_cfg.encap_rcv = rxe_udp_encap_recv;

	/* Setup UDP tunnel */
	setup_udp_tunnel_sock(net, sock, &tnl_cfg);

	return sock;
}

static void rxe_release_udp_tunnel(struct socket *sk)
{
	if (sk)
		udp_tunnel_sock_release(sk);
}

static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
			    __be16 dst_port)
{
	struct udphdr *udph;

	__skb_push(skb, sizeof(*udph));
	skb_reset_transport_header(skb);
	udph = udp_hdr(skb);

	udph->dest = dst_port;
	udph->source = src_port;
	udph->len = htons(skb->len);
	udph->check = 0;
}

static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
			     __be32 saddr, __be32 daddr, __u8 proto,
			     __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
	struct iphdr *iph;

	skb_scrub_packet(skb, xnet);

	skb_clear_hash(skb);
	skb_dst_set(skb, dst_clone(dst));
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);

	iph = ip_hdr(skb);

	iph->version	= IPVERSION;
	iph->ihl	= sizeof(struct iphdr) >> 2;
	iph->tot_len	= htons(skb->len);
	iph->frag_off	= df;
	iph->protocol	= proto;
	iph->tos	= tos;
	iph->daddr	= daddr;
	iph->saddr	= saddr;
	iph->ttl	= ttl;
	__ip_select_ident(dev_net(dst->dev), iph,
			  skb_shinfo(skb)->gso_segs ?: 1);
}

static void prepare_ipv6_hdr(struct sk_buff *skb, struct dst_entry *dst,
			     struct in6_addr *saddr, struct in6_addr *daddr,
			     __u8 proto, __u8 prio, __u8 ttl)
{
	struct ipv6hdr *ip6h;

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
			    | IPSKB_REROUTED);
	skb_dst_set(skb, dst_clone(dst));

	__skb_push(skb, sizeof(*ip6h));
	skb_reset_network_header(skb);
	ip6h = ipv6_hdr(skb);
	ip6_flow_hdr(ip6h, prio, htonl(0));
	ip6h->payload_len = htons(skb->len);
	ip6h->nexthdr = proto;
	ip6h->hop_limit = ttl;
	ip6h->daddr = *daddr;
	ip6h->saddr = *saddr;
	ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
}

static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt,
		    struct sk_buff *skb)
{
	struct rxe_qp *qp = pkt->qp;
	struct dst_entry *dst;
	bool xnet = false;
	__be16 df = htons(IP_DF);
	struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
	struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;

	dst = rxe_find_route(skb->dev, qp, av);
	if (!dst) {
		rxe_dbg_qp(qp, "Host not reachable\n");
		return -EHOSTUNREACH;
	}

	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
			cpu_to_be16(ROCE_V2_UDP_DPORT));

	prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP,
			 av->grh.traffic_class, av->grh.hop_limit, df, xnet);

	dst_release(dst);
	return 0;
}

static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt,
		    struct sk_buff *skb)
{
	struct rxe_qp *qp = pkt->qp;
	struct dst_entry *dst;
	struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
	struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;

	dst = rxe_find_route(skb->dev, qp, av);
	if (!dst) {
		rxe_dbg_qp(qp, "Host not reachable\n");
		return -EHOSTUNREACH;
	}

	prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
			cpu_to_be16(ROCE_V2_UDP_DPORT));

	prepare_ipv6_hdr(skb, dst, saddr, daddr, IPPROTO_UDP,
			 av->grh.traffic_class, av->grh.hop_limit);

	dst_release(dst);
	return 0;
}

int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
		struct sk_buff *skb)
{
	int err = 0;

	if (skb->protocol == htons(ETH_P_IP))
		err = prepare4(av, pkt, skb);
	else if (skb->protocol == htons(ETH_P_IPV6))
		err = prepare6(av, pkt, skb);

	if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
		pkt->mask |= RXE_LOOPBACK_MASK;

	return err;
}

static void rxe_skb_tx_dtor(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rxe_qp *qp = sk->sk_user_data;
	int skb_out = atomic_dec_return(&qp->skb_out);

	if (unlikely(qp->need_req_skb &&
		     skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
		rxe_sched_task(&qp->req.task);

	rxe_put(qp);
}

static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
	int err;

	skb->destructor = rxe_skb_tx_dtor;
	skb->sk = pkt->qp->sk->sk;

	rxe_get(pkt->qp);
	atomic_inc(&pkt->qp->skb_out);

	if (skb->protocol == htons(ETH_P_IP)) {
		err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	} else {
		rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n",
			   skb->protocol);
		atomic_dec(&pkt->qp->skb_out);
		rxe_put(pkt->qp);
		kfree_skb(skb);
		return -EINVAL;
	}

	if (unlikely(net_xmit_eval(err))) {
		rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err);
		return -EAGAIN;
	}

	return 0;
}

/* fix up a send packet to match the packets
 * received from UDP before looping them back
 */
static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
	memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));

	if (skb->protocol == htons(ETH_P_IP))
		skb_pull(skb, sizeof(struct iphdr));
	else
		skb_pull(skb, sizeof(struct ipv6hdr));

	if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) {
		kfree_skb(skb);
		return -EIO;
	}

	/* remove udp header */
	skb_pull(skb, sizeof(struct udphdr));

	rxe_rcv(skb);

	return 0;
}

int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
		    struct sk_buff *skb)
{
	int err;
	int is_request = pkt->mask & RXE_REQ_MASK;
	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
	unsigned long flags;

	spin_lock_irqsave(&qp->state_lock, flags);
	if ((is_request && (qp_state(qp) < IB_QPS_RTS)) ||
	    (!is_request && (qp_state(qp) < IB_QPS_RTR))) {
		spin_unlock_irqrestore(&qp->state_lock, flags);
		rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n");
		goto drop;
	}
	spin_unlock_irqrestore(&qp->state_lock, flags);

	rxe_icrc_generate(skb, pkt);

	if (pkt->mask & RXE_LOOPBACK_MASK)
		err = rxe_loopback(skb, pkt);
	else
		err = rxe_send(skb, pkt);

	if (err) {
		rxe_counter_inc(rxe, RXE_CNT_SEND_ERR);
		return err;
	}

	if ((qp_type(qp) != IB_QPT_RC) &&
	    (pkt->mask & RXE_END_MASK)) {
		pkt->wqe->state = wqe_state_done;
		rxe_sched_task(&qp->comp.task);
	}

	rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS);
	goto done;

drop:
	kfree_skb(skb);
	err = 0;
done:
	return err;
}

struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
				int paylen, struct rxe_pkt_info *pkt)
{
	unsigned int hdr_len;
	struct sk_buff *skb = NULL;
	struct net_device *ndev;
	const struct ib_gid_attr *attr;
	const int port_num = 1;

	attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index);
	if (IS_ERR(attr))
		return NULL;

	if (av->network_type == RXE_NETWORK_TYPE_IPV4)
		hdr_len = ETH_HLEN + sizeof(struct udphdr) +
			sizeof(struct iphdr);
	else
		hdr_len = ETH_HLEN + sizeof(struct udphdr) +
			sizeof(struct ipv6hdr);

	rcu_read_lock();
	ndev = rdma_read_gid_attr_ndev_rcu(attr);
	if (IS_ERR(ndev)) {
		rcu_read_unlock();
		goto out;
	}
	skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev),
			GFP_ATOMIC);

	if (unlikely(!skb)) {
		rcu_read_unlock();
		goto out;
	}

	skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));

	/* FIXME: hold reference to this netdev until life of this skb. */
	skb->dev = ndev;
	rcu_read_unlock();

	if (av->network_type == RXE_NETWORK_TYPE_IPV4)
		skb->protocol = htons(ETH_P_IP);
	else
		skb->protocol = htons(ETH_P_IPV6);

	pkt->rxe	= rxe;
	pkt->port_num	= port_num;
	pkt->hdr	= skb_put(skb, paylen);
	pkt->mask	|= RXE_GRH_MASK;

out:
	rdma_put_gid_attr(attr);
	return skb;
}

/*
 * this is required by rxe_cfg to match rxe devices in
 * /sys/class/infiniband up with their underlying ethernet devices
 */
const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
{
	return rxe->ndev->name;
}

int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
{
	int err;
	struct rxe_dev *rxe = NULL;

	rxe = ib_alloc_device(rxe_dev, ib_dev);
	if (!rxe)
		return -ENOMEM;

	rxe->ndev = ndev;

	err = rxe_add(rxe, ndev->mtu, ibdev_name);
	if (err) {
		ib_dealloc_device(&rxe->ib_dev);
		return err;
	}

	return 0;
}

static void rxe_port_event(struct rxe_dev *rxe,
			   enum ib_event_type event)
{
	struct ib_event ev;

	ev.device = &rxe->ib_dev;
	ev.element.port_num = 1;
	ev.event = event;

	ib_dispatch_event(&ev);
}

/* Caller must hold net_info_lock */
void rxe_port_up(struct rxe_dev *rxe)
{
	struct rxe_port *port;

	port = &rxe->port;
	port->attr.state = IB_PORT_ACTIVE;

	rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
	dev_info(&rxe->ib_dev.dev, "set active\n");
}

/* Caller must hold net_info_lock */
void rxe_port_down(struct rxe_dev *rxe)
{
	struct rxe_port *port;

	port = &rxe->port;
	port->attr.state = IB_PORT_DOWN;

	rxe_port_event(rxe, IB_EVENT_PORT_ERR);
	rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
	dev_info(&rxe->ib_dev.dev, "set down\n");
}

void rxe_set_port_state(struct rxe_dev *rxe)
{
	if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
		rxe_port_up(rxe);
	else
		rxe_port_down(rxe);
}

static int rxe_notify(struct notifier_block *not_blk,
		      unsigned long event,
		      void *arg)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
	struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);

	if (!rxe)
		return NOTIFY_OK;

	switch (event) {
	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&rxe->ib_dev);
		break;
	case NETDEV_UP:
		rxe_port_up(rxe);
		break;
	case NETDEV_DOWN:
		rxe_port_down(rxe);
		break;
	case NETDEV_CHANGEMTU:
		rxe_dbg_dev(rxe, "%s changed mtu to %d\n",
			    ndev->name, ndev->mtu);
		rxe_set_mtu(rxe, ndev->mtu);
		break;
	case NETDEV_CHANGE:
		rxe_set_port_state(rxe);
		break;
	case NETDEV_REBOOT:
	case NETDEV_GOING_DOWN:
	case NETDEV_CHANGEADDR:
	case NETDEV_CHANGENAME:
	case NETDEV_FEAT_CHANGE:
	default:
		rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n",
			    event, ndev->name);
		break;
	}

	ib_device_put(&rxe->ib_dev);
	return NOTIFY_OK;
}

static struct notifier_block rxe_net_notifier = {
	.notifier_call = rxe_notify,
};

static int rxe_net_ipv4_init(void)
{
	recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
				htons(ROCE_V2_UDP_DPORT), false);
	if (IS_ERR(recv_sockets.sk4)) {
		recv_sockets.sk4 = NULL;
		pr_err("Failed to create IPv4 UDP tunnel\n");
		return -1;
	}

	return 0;
}

static int rxe_net_ipv6_init(void)
{
#if IS_ENABLED(CONFIG_IPV6)
	recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
				htons(ROCE_V2_UDP_DPORT), true);
	if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) {
		recv_sockets.sk6 = NULL;
		pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
		return 0;
	}

	if (IS_ERR(recv_sockets.sk6)) {
		recv_sockets.sk6 = NULL;
		pr_err("Failed to create IPv6 UDP tunnel\n");
		return -1;
	}
#endif
	return 0;
}

void rxe_net_exit(void)
{
	rxe_release_udp_tunnel(recv_sockets.sk6);
	rxe_release_udp_tunnel(recv_sockets.sk4);
	unregister_netdevice_notifier(&rxe_net_notifier);
}

int rxe_net_init(void)
{
	int err;

	recv_sockets.sk6 = NULL;

	err = rxe_net_ipv4_init();
	if (err)
		return err;
	err = rxe_net_ipv6_init();
	if (err)
		goto err_out;
	err = register_netdevice_notifier(&rxe_net_notifier);
	if (err) {
		pr_err("Failed to register netdev notifier\n");
		goto err_out;
	}
	return 0;
err_out:
	rxe_net_exit();
	return err;
}
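rxe_setup_udp_tunnel() above is a plain use of the kernel's generic UDP tunnel API: udp_sock_create() opens an in-kernel socket on the RoCEv2 port, and setup_udp_tunnel_sock() installs the encap_rcv callback that steals matching UDP payloads before normal socket delivery. A stripped-down sketch of the same pattern; the demo_* names and the port number are hypothetical, while the two API calls are the real interface:

/* Minimal sketch of the udp_tunnel API used by rxe_setup_udp_tunnel(). */
#include <linux/err.h>
#include <net/udp_tunnel.h>

static int demo_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	/* consume every encapsulated packet */
	kfree_skb(skb);
	return 0;	/* 0: skb consumed; >0 hands it back to UDP */
}

static struct socket *demo_open_tunnel(struct net *net)
{
	struct udp_port_cfg udp_cfg = {
		.family = AF_INET,
		.local_udp_port = htons(5000),	/* hypothetical port */
	};
	struct udp_tunnel_sock_cfg tnl_cfg = {
		.encap_type = 1,
		.encap_rcv = demo_encap_recv,
	};
	struct socket *sock;
	int err;

	err = udp_sock_create(net, &udp_cfg, &sock);
	if (err < 0)
		return ERR_PTR(err);

	setup_udp_tunnel_sock(net, sock, &tnl_cfg);
	return sock;
}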
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_SCHED_GENERIC_H
#define __NET_SCHED_GENERIC_H

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>
#include <linux/xarray.h>

struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;

struct qdisc_rate_table {
	struct tc_ratespec rate;
	u32		data[256];
	struct qdisc_rate_table *next;
	int		refcnt;
};

enum qdisc_state_t {
	__QDISC_STATE_SCHED,
	__QDISC_STATE_DEACTIVATED,
	__QDISC_STATE_MISSED,
	__QDISC_STATE_DRAINING,
};

enum qdisc_state2_t {
	/* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
	 * Use qdisc_run_begin/end() or qdisc_is_running() instead.
	 */
	__QDISC_STATE2_RUNNING,
};

#define QDISC_STATE_MISSED	BIT(__QDISC_STATE_MISSED)
#define QDISC_STATE_DRAINING	BIT(__QDISC_STATE_DRAINING)

#define QDISC_STATE_NON_EMPTY	(QDISC_STATE_MISSED | \
				 QDISC_STATE_DRAINING)

struct qdisc_size_table {
	struct rcu_head		rcu;
	struct list_head	list;
	struct tc_sizespec	szopts;
	int			refcnt;
	u16			data[];
};

/* similar to sk_buff_head, but skb->prev pointer is undefined. */
struct qdisc_skb_head {
	struct sk_buff	*head;
	struct sk_buff	*tail;
	__u32		qlen;
	spinlock_t	lock;
};

struct Qdisc {
	int			(*enqueue)(struct sk_buff *skb,
					   struct Qdisc *sch,
					   struct sk_buff **to_free);
	struct sk_buff *	(*dequeue)(struct Qdisc *sch);
	unsigned int		flags;
#define TCQ_F_BUILTIN		1
#define TCQ_F_INGRESS		2
#define TCQ_F_CAN_BYPASS	4
#define TCQ_F_MQROOT		8
#define TCQ_F_ONETXQUEUE	0x10 /* dequeue_skb() can assume all skbs are for
				      * q->dev_queue : It can test
				      * netif_xmit_frozen_or_stopped() before
				      * dequeueing next packet.
				      * It's true for MQ/MQPRIO slaves, or
				      * non-multiqueue devices.
				      */
#define TCQ_F_WARN_NONWC	(1 << 16)
#define TCQ_F_CPUSTATS		0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT		0x40 /* root of its hierarchy :
				      * qdisc_tree_decrease_qlen() should stop.
				      */
#define TCQ_F_INVISIBLE		0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK		0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED		0x200 /* qdisc is offloaded to HW */
	u32			limit;
	const struct Qdisc_ops	*ops;
	struct qdisc_size_table	__rcu *stab;
	struct hlist_node	hash;
	u32			handle;
	u32			parent;

	struct netdev_queue	*dev_queue;

	struct net_rate_estimator __rcu *rate_est;
	struct gnet_stats_basic_sync __percpu *cpu_bstats;
	struct gnet_stats_queue	__percpu *cpu_qstats;
	int			pad;
	refcount_t		refcnt;

	/*
	 * For performance's sake on SMP, we put highly modified fields at the end
	 */
	struct sk_buff_head	gso_skb ____cacheline_aligned_in_smp;
	struct qdisc_skb_head	q;
	struct gnet_stats_basic_sync bstats;
	struct gnet_stats_queue	qstats;
	unsigned long		state;
	unsigned long		state2; /* must be written under qdisc spinlock */
	struct Qdisc		*next_sched;
	struct sk_buff_head	skb_bad_txq;

	spinlock_t		busylock ____cacheline_aligned_in_smp;
	spinlock_t		seqlock;

	struct rcu_head		rcu;
	netdevice_tracker	dev_tracker;
	/* private data */
	long			privdata[] ____cacheline_aligned;
};

static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return;
	refcount_inc(&qdisc->refcnt);
}

static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return true;
	return refcount_dec_if_one(&qdisc->refcnt);
}

/* Intended to be used by unlocked users, when concurrent qdisc release is
 * possible.
 */
static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return qdisc;
	if (refcount_inc_not_zero(&qdisc->refcnt))
		return qdisc;
	return NULL;
}

/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc
 * root_lock section, or provide their own memory barriers -- ordering
 * against qdisc_run_begin/end() atomic bit operations.
 */
static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_NOLOCK)
		return spin_is_locked(&qdisc->seqlock);
	return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
}

static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
{
	return !(READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY);
}

static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
{
	return q->flags & TCQ_F_CPUSTATS;
}

static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc))
		return nolock_qdisc_is_empty(qdisc);
	return !READ_ONCE(qdisc->q.qlen);
}

/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with
 * the qdisc root lock acquired.
 */
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_NOLOCK) {
		if (spin_trylock(&qdisc->seqlock))
			return true;

		/* No need to insist if the MISSED flag was already set.
		 * Note that test_and_set_bit() also gives us memory ordering
		 * guarantees wrt potential earlier enqueue() and below
		 * spin_trylock(), both of which are necessary to prevent races
		 */
		if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
			return false;

		/* Try to take the lock again to make sure that we will either
		 * grab it or the CPU that still has it will see MISSED set
		 * when testing it in qdisc_run_end()
		 */
		return spin_trylock(&qdisc->seqlock);
	}
	return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
}

static inline void qdisc_run_end(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_NOLOCK) {
		spin_unlock(&qdisc->seqlock);

		/* spin_unlock() only has store-release semantic. The unlock
		 * and test_bit() ordering is a store-load ordering, so a full
		 * memory barrier is needed here.
		 */
		smp_mb();

		if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state)))
			__netif_schedule(qdisc);
	} else {
		__clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
	}
}

static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
{
	return qdisc->flags & TCQ_F_ONETXQUEUE;
}

static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
{
#ifdef CONFIG_BQL
	/* Non-BQL migrated drivers will return 0, too. */
	return dql_avail(&txq->dql);
#else
	return 0;
#endif
}

struct Qdisc_class_ops {
	unsigned int		flags;
	/* Child qdisc manipulation */
	struct netdev_queue *	(*select_queue)(struct Qdisc *, struct tcmsg *);
	int			(*graft)(struct Qdisc *, unsigned long cl,
					 struct Qdisc *, struct Qdisc **,
					 struct netlink_ext_ack *extack);
	struct Qdisc *		(*leaf)(struct Qdisc *, unsigned long cl);
	void			(*qlen_notify)(struct Qdisc *, unsigned long);

	/* Class manipulation routines */
	unsigned long		(*find)(struct Qdisc *, u32 classid);
	int			(*change)(struct Qdisc *, u32, u32,
					  struct nlattr **, unsigned long *,
					  struct netlink_ext_ack *);
	int			(*delete)(struct Qdisc *, unsigned long,
					  struct netlink_ext_ack *);
	void			(*walk)(struct Qdisc *,
					struct qdisc_walker *arg);

	/* Filter manipulation */
	struct tcf_block *	(*tcf_block)(struct Qdisc *sch,
					     unsigned long arg,
					     struct netlink_ext_ack *extack);
	unsigned long		(*bind_tcf)(struct Qdisc *, unsigned long,
					    u32 classid);
	void			(*unbind_tcf)(struct Qdisc *, unsigned long);

	/* rtnetlink specific */
	int			(*dump)(struct Qdisc *, unsigned long,
					struct sk_buff *skb, struct tcmsg *);
	int			(*dump_stats)(struct Qdisc *, unsigned long,
					      struct gnet_dump *);
};

/* Qdisc_class_ops flag values */

/* Implements API that doesn't require rtnl lock */
enum qdisc_class_ops_flags {
	QDISC_CLASS_OPS_DOIT_UNLOCKED = 1,
};

struct Qdisc_ops {
	struct Qdisc_ops	*next;
	const struct Qdisc_class_ops	*cl_ops;
	char			id[IFNAMSIZ];
	int			priv_size;
	unsigned int		static_flags;

	int			(*enqueue)(struct sk_buff *skb,
					   struct Qdisc *sch,
					   struct sk_buff **to_free);
	struct sk_buff *	(*dequeue)(struct Qdisc *);
	struct sk_buff *	(*peek)(struct Qdisc *);

	int			(*init)(struct Qdisc *sch, struct nlattr *arg,
					struct netlink_ext_ack *extack);
	void			(*reset)(struct Qdisc *);
	void			(*destroy)(struct Qdisc *);
	int			(*change)(struct Qdisc *sch,
					  struct nlattr *arg,
					  struct netlink_ext_ack *extack);
	void			(*attach)(struct Qdisc *sch);
	int			(*change_tx_queue_len)(struct Qdisc *,
						       unsigned int);
	void			(*change_real_num_tx)(struct Qdisc *sch,
						      unsigned int new_real_tx);

	int			(*dump)(struct Qdisc *, struct sk_buff *);
	int			(*dump_stats)(struct Qdisc *,
					      struct gnet_dump *);

	void			(*ingress_block_set)(struct Qdisc *sch,
						     u32 block_index);
	void			(*egress_block_set)(struct Qdisc *sch,
						    u32 block_index);
	u32			(*ingress_block_get)(struct Qdisc *sch);
	u32			(*egress_block_get)(struct Qdisc *sch);

	struct module		*owner;
};

struct tcf_result {
	union {
		struct {
			unsigned long	class;
			u32		classid;
		};
		const struct tcf_proto *goto_tp;
	};
};

struct tcf_chain;

struct tcf_proto_ops {
	struct list_head	head;
	char			kind[IFNAMSIZ];

	int			(*classify)(struct sk_buff *,
					    const struct tcf_proto *,
					    struct tcf_result *);
	int			(*init)(struct tcf_proto *);
	void			(*destroy)(struct tcf_proto *tp,
					   bool rtnl_held,
					   struct netlink_ext_ack *extack);

	void *			(*get)(struct tcf_proto *, u32 handle);
	void			(*put)(struct tcf_proto *tp, void *f);
	int			(*change)(struct net *net, struct sk_buff *,
					  struct tcf_proto *, unsigned long,
					  u32 handle, struct nlattr **,
					  void **, u32,
					  struct netlink_ext_ack *);
	int			(*delete)(struct tcf_proto *tp, void *arg,
					  bool *last, bool rtnl_held,
					  struct netlink_ext_ack *);
	bool			(*delete_empty)(struct tcf_proto *tp);
	void			(*walk)(struct tcf_proto *tp,
					struct tcf_walker *arg,
					bool rtnl_held);
	int			(*reoffload)(struct tcf_proto *tp, bool add,
					     flow_setup_cb_t *cb,
					     void *cb_priv,
					     struct netlink_ext_ack *extack);
	void			(*hw_add)(struct tcf_proto *tp,
					  void *type_data);
	void			(*hw_del)(struct tcf_proto *tp,
					  void *type_data);
	void			(*bind_class)(void *, u32, unsigned long,
					      void *, unsigned long);
	void *			(*tmplt_create)(struct net *net,
						struct tcf_chain *chain,
						struct nlattr **tca,
						struct netlink_ext_ack *extack);
	void			(*tmplt_destroy)(void *tmplt_priv);
	struct tcf_exts *	(*get_exts)(const struct tcf_proto *tp,
					    u32 handle);

	/* rtnetlink specific */
	int			(*dump)(struct net *, struct tcf_proto *,
					void *, struct sk_buff *skb,
					struct tcmsg *, bool);
	int			(*terse_dump)(struct net *net,
					      struct tcf_proto *tp, void *fh,
					      struct sk_buff *skb,
					      struct tcmsg *t, bool rtnl_held);
	int			(*tmplt_dump)(struct sk_buff *skb,
					      struct net *net,
					      void *tmplt_priv);

	struct module		*owner;
	int			flags;
};

/* Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags
 * are expected to implement tcf_proto_ops->delete_empty(), otherwise race
 * conditions can occur when filters are inserted/deleted simultaneously.
 */
enum tcf_proto_ops_flags {
	TCF_PROTO_OPS_DOIT_UNLOCKED = 1,
};

struct tcf_proto {
	/* Fast access part */
	struct tcf_proto __rcu	*next;
	void __rcu		*root;

	/* called under RCU BH lock */
	int			(*classify)(struct sk_buff *,
					    const struct tcf_proto *,
					    struct tcf_result *);
	__be16			protocol;

	/* All the rest */
	u32			prio;
	void			*data;
	const struct tcf_proto_ops	*ops;
	struct tcf_chain	*chain;
	/* Lock protects tcf_proto shared state and can be used by unlocked
	 * classifiers to protect their private data.
	 */
	spinlock_t		lock;
	bool			deleting;
	refcount_t		refcnt;
	struct rcu_head		rcu;
	struct hlist_node	destroy_ht_node;
};

struct qdisc_skb_cb {
	struct {
		unsigned int		pkt_len;
		u16			slave_dev_queue_mapping;
		u16			tc_classid;
	};
#define QDISC_CB_PRIV_LEN 20
	unsigned char		data[QDISC_CB_PRIV_LEN];
};

typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);

struct tcf_chain {
	/* Protects filter_chain. */
	struct mutex filter_chain_lock;
	struct tcf_proto __rcu *filter_chain;
	struct list_head list;
	struct tcf_block *block;
	u32 index; /* chain index */
	unsigned int refcnt;
	unsigned int action_refcnt;
	bool explicitly_created;
	bool flushing;
	const struct tcf_proto_ops *tmplt_ops;
	void *tmplt_priv;
	struct rcu_head rcu;
};

struct tcf_block {
	struct xarray ports; /* datapath accessible */
	/* Lock protects tcf_block and lifetime-management data of chains
	 * attached to the block (refcnt, action_refcnt, explicitly_created).
	 */
	struct mutex lock;
	struct list_head chain_list;
	u32 index; /* block index for shared blocks */
	u32 classid; /* which class this block belongs to */
	refcount_t refcnt;
	struct net *net;
	struct Qdisc *q;
	struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
	struct flow_block flow_block;
	struct list_head owner_list;
	bool keep_dst;
	atomic_t offloadcnt; /* Number of offloaded filters */
	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
	unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
	struct {
		struct tcf_chain *chain;
		struct list_head filter_chain_list;
	} chain0;
	struct rcu_head rcu;
	DECLARE_HASHTABLE(proto_destroy_ht, 7);
	struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */
};

struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index);

static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
	return lockdep_is_held(&chain->filter_chain_lock);
}

static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
	return lockdep_is_held(&tp->lock);
}

#define tcf_chain_dereference(p, chain)					\
	rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain))

#define tcf_proto_dereference(p, tp)					\
	rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))

static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
{
	struct qdisc_skb_cb *qcb;

	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb));
	BUILD_BUG_ON(sizeof(qcb->data) < sz);
}

static inline int qdisc_qlen(const struct Qdisc *q)
{
	return q->q.qlen;
}

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
	__u32 qlen = q->qstats.qlen;
	int i;

	if (qdisc_is_percpu_stats(q)) {
		for_each_possible_cpu(i)
			qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
	} else {
		qlen += q->q.qlen;
	}
	return qlen;
}

static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
	return (struct qdisc_skb_cb *)skb->cb;
}

static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
	return &qdisc->q.lock;
}

static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
	struct Qdisc *q = rcu_dereference_rtnl(qdisc->dev_queue->qdisc);

	return q;
}

static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
{
	return rcu_dereference_bh(qdisc->dev_queue->qdisc);
}

static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
{
	return rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping);
}

static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
{
	struct Qdisc *root = qdisc_root_sleeping(qdisc);

	ASSERT_RTNL();
	return qdisc_lock(root);
}

static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
	return qdisc->dev_queue->dev;
}

static inline void sch_tree_lock(struct Qdisc *q)
{
	if (q->flags & TCQ_F_MQROOT)
		spin_lock_bh(qdisc_lock(q));
	else
		spin_lock_bh(qdisc_root_sleeping_lock(q));
}

static inline void sch_tree_unlock(struct Qdisc *q)
{
	if (q->flags & TCQ_F_MQROOT)
		spin_unlock_bh(qdisc_lock(q));
	else
		spin_unlock_bh(qdisc_root_sleeping_lock(q));
}

extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1];
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;

static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
	return ntx < dev->real_num_tx_queues ?
			default_qdisc_ops : &pfifo_fast_ops;
}

struct Qdisc_class_common {
	u32			classid;
	unsigned int		filter_cnt;
	struct hlist_node	hnode;
};

struct Qdisc_class_hash {
	struct hlist_head	*hash;
	unsigned int		hashsize;
	unsigned int		hashmask;
	unsigned int		hashelems;
};

static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
{
	id ^= id >> 8;
	id ^= id >> 4;
	return id & mask;
}

static inline struct Qdisc_class_common *
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id)
{
	struct Qdisc_class_common *cl;
	unsigned int h;

	if (!id)
		return NULL;

	h = qdisc_class_hash(id, hash->hashmask);
	hlist_for_each_entry(cl, &hash->hash[h], hnode) {
		if (cl->classid == id)
			return cl;
	}
	return NULL;
}

static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl)
{
	return cl->filter_cnt > 0;
}

static inline void qdisc_class_get(struct Qdisc_class_common *cl)
{
	unsigned int res;

	if (check_add_overflow(cl->filter_cnt, 1, &res))
		WARN(1, "Qdisc class overflow");

	cl->filter_cnt = res;
}

static inline void qdisc_class_put(struct Qdisc_class_common *cl)
{
	unsigned int res;

	if (check_sub_overflow(cl->filter_cnt, 1, &res))
		WARN(1, "Qdisc class underflow");

	cl->filter_cnt = res;
}

static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
{
	u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;

	return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL;
}

int qdisc_class_hash_init(struct Qdisc_class_hash *);
void qdisc_class_hash_insert(struct Qdisc_class_hash *,
			     struct Qdisc_class_common *);
void qdisc_class_hash_remove(struct Qdisc_class_hash *,
			     struct Qdisc_class_common *);
void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *);

int dev_qdisc_change_tx_queue_len(struct net_device *dev);
void dev_qdisc_change_real_num_tx(struct net_device *dev,
				  unsigned int new_real_tx);
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
void dev_deactivate(struct net_device *dev);
void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
			      void *type_data);
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack);
#else
static inline int
qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
			  void *type_data)
{
	q->flags &= ~TCQ_F_OFFLOADED;
	return 0;
}

static inline void
qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
			   struct Qdisc *new, struct Qdisc *old,
			   enum tc_setup_type type, void *type_data,
			   struct netlink_ext_ack *extack)
{
}
#endif
void qdisc_offload_query_caps(struct net_device *dev,
			      enum tc_setup_type type,
			      void *caps, size_t caps_len);
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack);
void qdisc_free(struct Qdisc *qdisc);
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops, u32 parentid,
				struct netlink_ext_ack *extack);
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab);
int skb_do_redirect(struct sk_buff *);

static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_XGRESS
	return skb->tc_at_ingress;
#else
	return false;
#endif
}

static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_skip_classify) {
		skb->tc_skip_classify = 0;
		return true;
	}
#endif
	return false;
}

/* Reset all TX qdiscs greater than index of a device.  */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
	struct Qdisc *qdisc;

	for (; i < dev->num_tx_queues; i++) {
		qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
		if (qdisc) {
			spin_lock_bh(qdisc_lock(qdisc));
			qdisc_reset(qdisc);
			spin_unlock_bh(qdisc_lock(qdisc));
		}
	}
}

/* Are all TX queues of the device empty?  */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
	unsigned int i;

	rcu_read_lock();
	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		const struct Qdisc *q = rcu_dereference(txq->qdisc);

		if (!qdisc_is_empty(q)) {
			rcu_read_unlock();
			return false;
		}
	}
	rcu_read_unlock();
	return true;
}

/* Are any of the TX qdiscs changing?  */
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		if (rcu_access_pointer(txq->qdisc) !=
		    rcu_access_pointer(txq->qdisc_sleeping))
			return true;
	}
	return false;
}

/* Is the device using the noop qdisc on all queues?  */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		if (rcu_access_pointer(txq->qdisc) != &noop_qdisc)
			return false;
	}
	return true;
}

static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
{
	return qdisc_skb_cb(skb)->pkt_len;
}

/* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */
enum net_xmit_qdisc_t {
	__NET_XMIT_STOLEN = 0x00010000,
	__NET_XMIT_BYPASS = 0x00020000,
};

#ifdef CONFIG_NET_CLS_ACT
#define net_xmit_drop_count(e)	((e) & __NET_XMIT_STOLEN ? 0 : 1)
#else
#define net_xmit_drop_count(e)	(1)
#endif

static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
					   const struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
	struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab);

	if (stab)
		__qdisc_calculate_pkt_len(skb, stab);
#endif
}

static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	qdisc_calculate_pkt_len(skb, sch);
	return sch->enqueue(skb, sch, to_free);
}

static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
				  __u64 bytes, __u32 packets)
{
	u64_stats_update_begin(&bstats->syncp);
	u64_stats_add(&bstats->bytes, bytes);
	u64_stats_add(&bstats->packets, packets);
	u64_stats_update_end(&bstats->syncp);
}

static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
				 const struct sk_buff *skb)
{
	_bstats_update(bstats,
		       qdisc_pkt_len(skb),
		       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
}

static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
					   const struct sk_buff *skb)
{
	bstats_update(this_cpu_ptr(sch->cpu_bstats), skb);
}

static inline void qdisc_bstats_update(struct Qdisc *sch,
				       const struct sk_buff *skb)
{
	bstats_update(&sch->bstats, skb);
}

static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
					    const struct sk_buff *skb)
{
	sch->qstats.backlog -= qdisc_pkt_len(skb);
}

static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
						const struct sk_buff *skb)
{
	this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
}

static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
					    const struct sk_buff *skb)
{
	sch->qstats.backlog += qdisc_pkt_len(skb);
}

static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
						const struct sk_buff *skb)
{
	this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
}

static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
{
	this_cpu_inc(sch->cpu_qstats->qlen);
}

static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
{
	this_cpu_dec(sch->cpu_qstats->qlen);
}

static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
{
	this_cpu_inc(sch->cpu_qstats->requeues);
}

static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
	sch->qstats.drops += count;
}

static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
	qstats->drops++;
}

static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
{
	qstats->overlimits++;
}

static inline void qdisc_qstats_drop(struct Qdisc *sch)
{
	qstats_drop_inc(&sch->qstats);
}

static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
{
	this_cpu_inc(sch->cpu_qstats->drops);
}

static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
{
	sch->qstats.overlimits++;
}

static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch)
{
	__u32 qlen = qdisc_qlen_sum(sch);

	return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats, qlen);
}

static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen,
					     __u32 *backlog)
{
	struct gnet_stats_queue qstats = { 0 };

	gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats);
	*qlen = qstats.qlen + qdisc_qlen(sch);
	*backlog = qstats.backlog;
}

static inline void qdisc_tree_flush_backlog(struct Qdisc *sch)
{
	__u32 qlen, backlog;

	qdisc_qstats_qlen_backlog(sch, &qlen, &backlog);
	qdisc_tree_reduce_backlog(sch, qlen, backlog);
}

static inline void qdisc_purge_queue(struct Qdisc *sch)
{
	__u32 qlen, backlog;

	qdisc_qstats_qlen_backlog(sch, &qlen, &backlog);
	qdisc_reset(sch);
	qdisc_tree_reduce_backlog(sch, qlen, backlog);
}

static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
					struct qdisc_skb_head *qh)
{
	struct sk_buff *last = qh->tail;

	if (last) {
		skb->next = NULL;
		last->next = skb;
		qh->tail = skb;
	} else {
		qh->tail = skb;
		qh->head = skb;
	}
	qh->qlen++;
}

static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
{
	__qdisc_enqueue_tail(skb, &sch->q);
	qdisc_qstats_backlog_inc(sch, skb);
	return NET_XMIT_SUCCESS;
}

static inline void __qdisc_enqueue_head(struct sk_buff *skb,
					struct qdisc_skb_head *qh)
{
	skb->next = qh->head;

	if (!qh->head)
		qh->tail = skb;
	qh->head = skb;
	qh->qlen++;
}

static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
	struct sk_buff *skb = qh->head;

	if (likely(skb != NULL)) {
		qh->head = skb->next;
		qh->qlen--;
		if (qh->head == NULL)
			qh->tail = NULL;
		skb->next = NULL;
	}

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
	struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);

	if (likely(skb != NULL)) {
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_bstats_update(sch, skb);
	}

	return skb;
}

struct tc_skb_cb {
	struct qdisc_skb_cb qdisc_cb;
	u32 drop_reason;

	u16 zone; /* Only valid if post_ct = true */
	u16 mru;
	u8 post_ct:1;
	u8 post_ct_snat:1;
	u8 post_ct_dnat:1;
};

static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
{
	struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb;

	BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb));
	return cb;
}

static inline enum skb_drop_reason
tcf_get_drop_reason(const struct sk_buff *skb)
{
	return tc_skb_cb(skb)->drop_reason;
}

static inline void tcf_set_drop_reason(const struct sk_buff *skb,
				       enum skb_drop_reason reason)
{
	tc_skb_cb(skb)->drop_reason = reason;
}

/* Instead of calling kfree_skb() while root qdisc lock is held,
 * queue the skb for future freeing at end of __dev_xmit_skb()
 */
static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
{
	skb->next = *to_free;
	*to_free = skb;
}

static inline void __qdisc_drop_all(struct sk_buff *skb,
				    struct sk_buff **to_free)
{
	if (skb->prev)
		skb->prev->next = *to_free;
	else
		skb->next = *to_free;
	*to_free = skb;
}

static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
						   struct qdisc_skb_head *qh,
						   struct sk_buff **to_free)
{
	struct sk_buff *skb = __qdisc_dequeue_head(qh);

	if (likely(skb != NULL)) {
		unsigned int len = qdisc_pkt_len(skb);

		qdisc_qstats_backlog_dec(sch, skb);
		__qdisc_drop(skb, to_free);
		return len;
	}

	return 0;
}

static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
	const struct qdisc_skb_head *qh = &sch->q;

	return qh->head;
}

/* generic pseudo peek method for non-work-conserving qdisc */
static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
{
	struct sk_buff *skb = skb_peek(&sch->gso_skb);

	/* we can reuse ->gso_skb because peek isn't called for root qdiscs */
	if (!skb) {
		skb = sch->dequeue(sch);

		if (skb) {
			__skb_queue_head(&sch->gso_skb, skb);
			/* it's still part of the queue */
			qdisc_qstats_backlog_inc(sch, skb);
			sch->q.qlen++;
		}
	}

	return skb;
}

static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
						 struct sk_buff *skb)
{
	if (qdisc_is_percpu_stats(sch)) {
		qdisc_qstats_cpu_backlog_dec(sch, skb);
		qdisc_bstats_cpu_update(sch, skb);
		qdisc_qstats_cpu_qlen_dec(sch);
	} else {
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_bstats_update(sch, skb);
		sch->q.qlen--;
	}
}

static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
						 unsigned int pkt_len)
{
	if (qdisc_is_percpu_stats(sch)) {
		qdisc_qstats_cpu_qlen_inc(sch);
		this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
	} else {
		sch->qstats.backlog += pkt_len;
		sch->q.qlen++;
	}
}

/* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
{
	struct sk_buff *skb = skb_peek(&sch->gso_skb);

	if (skb) {
		skb = __skb_dequeue(&sch->gso_skb);
		if (qdisc_is_percpu_stats(sch)) {
			qdisc_qstats_cpu_backlog_dec(sch, skb);
			qdisc_qstats_cpu_qlen_dec(sch);
		} else {
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
		}
	} else {
		skb = sch->dequeue(sch);
	}

	return skb;
}

static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
	/*
	 * We do not know the backlog in bytes of this list, it
	 * is up to the caller to correct it
	 */
	ASSERT_RTNL();
	if (qh->qlen) {
		rtnl_kfree_skbs(qh->head, qh->tail);

		qh->head = NULL;
		qh->tail = NULL;
		qh->qlen = 0;
	}
}

static inline void qdisc_reset_queue(struct Qdisc *sch)
{
	__qdisc_reset_queue(&sch->q);
}

static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
					  struct Qdisc **pold)
{
	struct Qdisc *old;

	sch_tree_lock(sch);
	old = *pold;
	*pold = new;
	if (old != NULL)
		qdisc_purge_queue(old);
	sch_tree_unlock(sch);

	return old;
}

static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
	rtnl_kfree_skbs(skb, skb);
	qdisc_qstats_drop(sch);
}

static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
				 struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	qdisc_qstats_cpu_drop(sch);

	return NET_XMIT_DROP;
}

static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
			     struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	qdisc_qstats_drop(sch);

	return NET_XMIT_DROP;
}

static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
				 struct sk_buff **to_free)
{
	__qdisc_drop_all(skb, to_free);
	qdisc_qstats_drop(sch);

	return NET_XMIT_DROP;
}

struct psched_ratecfg {
	u64	rate_bytes_ps; /* bytes per second */
	u32	mult;
	u16	overhead;
	u16	mpu;
	u8	linklayer;
	u8	shift;
};

static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
				unsigned int len)
{
	len += r->overhead;

	if (len < r->mpu)
		len = r->mpu;

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
		return ((u64)(DIV_ROUND_UP(len, 48) * 53) * r->mult) >> r->shift;

	return ((u64)len * r->mult) >> r->shift;
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64);

static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
					  const struct psched_ratecfg *r)
{
	memset(res, 0, sizeof(*res));

	/* legacy struct tc_ratespec has a 32bit @rate field
	 * Qdisc using 64bit rate should add new attributes
	 * in order to maintain compatibility.
	 */
	res->rate = min_t(u64, r->rate_bytes_ps, ~0U);

	res->overhead = r->overhead;
	res->mpu = r->mpu;
	res->linklayer = (r->linklayer & TC_LINKLAYER_MASK);
}

struct psched_pktrate {
	u64	rate_pkts_ps; /* packets per second */
	u32	mult;
	u8	shift;
};

static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r,
				  unsigned int pkt_num)
{
	return ((u64)pkt_num * r->mult) >> r->shift;
}

void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64);

/* Mini Qdisc serves for specific needs of ingress/clsact Qdisc.
 * The fast path only needs to access the filter list and to update stats.
 */
struct mini_Qdisc {
	struct tcf_proto *filter_list;
	struct tcf_block *block;
	struct gnet_stats_basic_sync __percpu *cpu_bstats;
	struct gnet_stats_queue	__percpu *cpu_qstats;
	unsigned long rcu_state;
};

static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
						const struct sk_buff *skb)
{
	bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb);
}

static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq)
{
	this_cpu_inc(miniq->cpu_qstats->drops);
}

struct mini_Qdisc_pair {
	struct mini_Qdisc miniq1;
	struct mini_Qdisc miniq2;
	struct mini_Qdisc __rcu **p_miniq;
};

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head);
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq);
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
				struct tcf_block *block);

void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx);

int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));

/* Make sure qdisc is no longer in SCHED state. */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
	while (test_bit(__QDISC_STATE_SCHED, &q->state))
		msleep(1);
}

#endif
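The enqueue/dequeue helpers above are enough to assemble a working classless qdisc. Below is a hedged sketch of a minimal FIFO built on them; the demo_fifo names are hypothetical, and a real implementation would also provide .init to set sch->limit and register itself with register_qdisc() from net/pkt_sched.h:

/* Minimal FIFO qdisc sketch using the helpers declared in this header. */
#include <linux/module.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

static int demo_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			     struct sk_buff **to_free)
{
	if (likely(sch->q.qlen < READ_ONCE(sch->limit)))
		return qdisc_enqueue_tail(skb, sch);	/* updates backlog */

	return qdisc_drop(skb, sch, to_free);		/* counts the drop */
}

static struct sk_buff *demo_fifo_dequeue(struct Qdisc *sch)
{
	return qdisc_dequeue_head(sch);	/* updates bstats and backlog */
}

static struct Qdisc_ops demo_fifo_qdisc_ops = {
	.id		= "demo_fifo",	/* hypothetical id */
	.enqueue	= demo_fifo_enqueue,
	.dequeue	= demo_fifo_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};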
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	INET		An implementation of the TCP/IP protocol suite for the LINUX
 *			operating system.  INET is implemented using the BSD Socket
 *			interface as the means of communication with the user level.
 *
 *			Checksumming functions for IPv6
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Borrows very liberally from tcp.c and ip.c, see those
 *		files for more names.
 */

/*
 *	Fixes:
 *
 *	Ralf Baechle			:	generic ipv6 checksum
 *	<ralf@waldorf-gmbh.de>
 */

#ifndef _CHECKSUM_IPV6_H
#define _CHECKSUM_IPV6_H

#include <asm/types.h>
#include <asm/byteorder.h>
#include <net/ip.h>
#include <asm/checksum.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/ipv6.h>

#ifndef _HAVE_ARCH_IPV6_CSUM
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
			const struct in6_addr *daddr,
			__u32 len, __u8 proto, __wsum csum);
#endif

static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
{
	return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len, proto, 0));
}

static __inline__ __sum16 tcp_v6_check(int len,
				       const struct in6_addr *saddr,
				       const struct in6_addr *daddr,
				       __wsum base)
{
	return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
}

static inline void __tcp_v6_send_check(struct sk_buff *skb,
				       const struct in6_addr *saddr,
				       const struct in6_addr *daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb)
{
	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	ipv6h->payload_len = 0;
	th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0);
}

static inline __sum16 udp_v6_check(int len,
				   const struct in6_addr *saddr,
				   const struct in6_addr *daddr,
				   __wsum base)
{
	return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

void udp6_set_csum(bool nocheck, struct sk_buff *skb,
		   const struct in6_addr *saddr,
		   const struct in6_addr *daddr, int len);

int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);

#endif
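udp_v6_check() above wraps csum_ipv6_magic() with the UDP protocol number; a caller folds the pseudo-header result into the packet, remembering that an all-zero UDP checksum means "no checksum" on the wire and must be replaced with 0xffff. A hedged sketch (demo_udp6_fill_check is hypothetical; it assumes the network and transport headers are already set and the payload checksum is passed in as "csum"):

/* Sketch: filling a UDPv6 checksum by hand with the helpers above. */
#include <linux/skbuff.h>
#include <linux/udp.h>
#include <linux/ipv6.h>
#include <net/ip6_checksum.h>

static void demo_udp6_fill_check(struct sk_buff *skb, __wsum csum)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct udphdr *uh = udp_hdr(skb);
	int len = ntohs(uh->len);

	uh->check = udp_v6_check(len, &ip6h->saddr, &ip6h->daddr, csum);
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;	/* 0 means "no checksum" */
}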
1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 
2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 
3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 
3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 
4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 
5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 
5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 
6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 
7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 
8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 
8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2003 Intel Corp.
 * Copyright (c) 2001-2002 Nokia, Inc.
 * Copyright (c) 2001 La Monte H.P. Yarroll
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions interface with the sockets layer to implement the
 * SCTP Extensions for the Sockets API.
 *
 * Note that the descriptions from the specification are USER level
 * functions--this file contains the functions which populate the struct
 * proto for SCTP, which is the BOTTOM of the sockets interface.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Narasimha Budihal <narsi@refcode.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    Jon Grimm <jgrimm@us.ibm.com>
 *    Xingang Guo <xingang.guo@intel.com>
 *    Daisy Chang <daisyc@us.ibm.com>
 *    Sridhar Samudrala <samudrala@us.ibm.com>
 *    Inaky Perez-Gonzalez <inaky.gonzalez@intel.com>
 *    Ardelle Fan <ardelle.fan@intel.com>
 *    Ryan Layer <rmlayer@us.ibm.com>
 *    Anup Pemmaiah <pemmaiah@cc.usu.edu>
 *    Kevin Gao <kevin.gao@intel.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/time.h>
#include <linux/sched/signal.h>
#include <linux/ip.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>

#include <net/ip.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>

#include <linux/socket.h> /* for sa_family_t */
#include <linux/export.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>
#include <net/sctp/stream_sched.h>

/* Forward declarations for internal helper functions. */
static bool sctp_writeable(const struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
				size_t msg_len);
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
static int sctp_wait_for_accept(struct sock *sk, long timeo);
static void sctp_wait_for_close(struct sock *sk, long timeo);
static void sctp_destruct_sock(struct sock *sk);
static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
					union sctp_addr *addr, int len);
static int sctp_bindx_add(struct sock *, struct sockaddr *, int);
static int sctp_bindx_rem(struct sock *, struct sockaddr *, int);
static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int);
static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int);
static int sctp_send_asconf(struct sctp_association *asoc,
			    struct sctp_chunk *chunk);
static int sctp_do_bind(struct sock *, union sctp_addr *, int);
static int sctp_autobind(struct sock *sk);
static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
			     struct sctp_association *assoc,
			     enum sctp_socket_type type);

static unsigned long sctp_memory_pressure;
static atomic_long_t sctp_memory_allocated;
static DEFINE_PER_CPU(int, sctp_memory_per_cpu_fw_alloc);
struct percpu_counter sctp_sockets_allocated;

static void sctp_enter_memory_pressure(struct sock *sk)
{
	WRITE_ONCE(sctp_memory_pressure, 1);
}

/* Get the sndbuf space currently available for the association. */
static inline int sctp_wspace(struct sctp_association *asoc)
{
	struct sock *sk = asoc->base.sk;

	return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used
				       : sk_stream_wspace(sk);
}
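/* A sketch of the two accounting modes selected above, with hypothetical
 * numbers (not taken from the original source):
 *
 *	net.sctp.sndbuf_policy = 1 (per association):
 *		sk->sk_sndbuf = 212992, asoc->sndbuf_used = 12288
 *		=> sctp_wspace(asoc) = 212992 - 12288 = 200704
 *
 *	net.sctp.sndbuf_policy = 0 (per socket, the default):
 *		every association on the socket shares the single
 *		sk_stream_wspace(sk) value, so one busy association can
 *		exhaust the send space for its siblings on a
 *		one-to-many socket.
 */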
/* Increment the used sndbuf space count of the corresponding association by
 * the size of the outgoing data chunk.
 * Also, set the skb destructor for sndbuf accounting later.
 *
 * Since it is always 1-1 between chunk and skb, and also a new skb is always
 * allocated for chunk bundling in sctp_packet_transmit(), we can use the
 * destructor in the data chunk skb for the purpose of the sndbuf space
 * tracking.
 */
static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
{
	struct sctp_association *asoc = chunk->asoc;
	struct sock *sk = asoc->base.sk;

	/* The sndbuf space is tracked per association. */
	sctp_association_hold(asoc);

	if (chunk->shkey)
		sctp_auth_shkey_hold(chunk->shkey);

	skb_set_owner_w(chunk->skb, sk);

	chunk->skb->destructor = sctp_wfree;
	/* Save the chunk pointer in skb for sctp_wfree to use later. */
	skb_shinfo(chunk->skb)->destructor_arg = chunk;

	refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
	asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
	sk_wmem_queued_add(sk, chunk->skb->truesize + sizeof(struct sctp_chunk));
	sk_mem_charge(sk, chunk->skb->truesize);
}

static void sctp_clear_owner_w(struct sctp_chunk *chunk)
{
	skb_orphan(chunk->skb);
}

#define traverse_and_process()	\
do {				\
	msg = chunk->msg;	\
	if (msg == prev_msg)	\
		continue;	\
	list_for_each_entry(c, &msg->chunks, frag_list) {	\
		if ((clear && asoc->base.sk == c->skb->sk) ||	\
		    (!clear && asoc->base.sk != c->skb->sk))	\
			cb(c);	\
	}			\
	prev_msg = msg;		\
} while (0)

static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
				       bool clear,
				       void (*cb)(struct sctp_chunk *))
{
	struct sctp_datamsg *msg, *prev_msg = NULL;
	struct sctp_outq *q = &asoc->outqueue;
	struct sctp_chunk *chunk, *c;
	struct sctp_transport *t;

	list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
		list_for_each_entry(chunk, &t->transmitted, transmitted_list)
			traverse_and_process();

	list_for_each_entry(chunk, &q->retransmit, transmitted_list)
		traverse_and_process();

	list_for_each_entry(chunk, &q->sacked, transmitted_list)
		traverse_and_process();

	list_for_each_entry(chunk, &q->abandoned, transmitted_list)
		traverse_and_process();

	list_for_each_entry(chunk, &q->out_chunk_list, list)
		traverse_and_process();
}

static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
				 void (*cb)(struct sk_buff *, struct sock *))
{
	struct sk_buff *skb, *tmp;

	sctp_skb_for_each(skb, &asoc->ulpq.lobby, tmp)
		cb(skb, sk);

	sctp_skb_for_each(skb, &asoc->ulpq.reasm, tmp)
		cb(skb, sk);

	sctp_skb_for_each(skb, &asoc->ulpq.reasm_uo, tmp)
		cb(skb, sk);
}

/* Verify that this is a valid address. */
static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
				   int len)
{
	struct sctp_af *af;

	/* Verify basic sockaddr. */
	af = sctp_sockaddr_af(sctp_sk(sk), addr, len);
	if (!af)
		return -EINVAL;

	/* Is this a valid SCTP address? */
	if (!af->addr_valid(addr, sctp_sk(sk), NULL))
		return -EINVAL;

	if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr)))
		return -EINVAL;

	return 0;
}

/* Look up the association by its id.  If this is not a UDP-style
 * socket, the ID field is always ignored.
 */
struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)
{
	struct sctp_association *asoc = NULL;

	/* If this is not a UDP-style socket, assoc id should be ignored. */
	if (!sctp_style(sk, UDP)) {
		/* Return NULL if the socket state is not ESTABLISHED. It
		 * could be a TCP-style listening socket or a socket which
		 * hasn't yet called connect() to establish an association.
		 */
		if (!sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING))
			return NULL;

		/* Get the first and the only association from the list. */
		if (!list_empty(&sctp_sk(sk)->ep->asocs))
			asoc = list_entry(sctp_sk(sk)->ep->asocs.next,
					  struct sctp_association, asocs);
		return asoc;
	}

	/* Otherwise this is a UDP-style socket. */
	if (id <= SCTP_ALL_ASSOC)
		return NULL;

	spin_lock_bh(&sctp_assocs_id_lock);
	asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id);
	if (asoc && (asoc->base.sk != sk || asoc->base.dead))
		asoc = NULL;
	spin_unlock_bh(&sctp_assocs_id_lock);

	return asoc;
}
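/* For context, a hedged user-space sketch of where the ids handled by
 * sctp_id2assoc() come from.  On a one-to-many (UDP-style) socket the
 * application learns assoc ids from SCTP_ASSOC_CHANGE notifications and
 * passes them back through socket options; fd and assoc_id below are
 * hypothetical, and headers/error handling are trimmed:
 *
 *	struct sctp_status status = { .sstat_assoc_id = assoc_id };
 *	socklen_t len = sizeof(status);
 *
 *	if (getsockopt(fd, IPPROTO_SCTP, SCTP_STATUS, &status, &len) == 0)
 *		printf("assoc state %d\n", status.sstat_state);
 *
 * On a one-to-one (TCP-style) socket the id is ignored, matching the
 * !sctp_style(sk, UDP) branch above.
 */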
/* Look up the transport from an address and an assoc id. If both address and
 * id are specified, the associations matching the address and the id should be
 * the same.
 */
static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
					      struct sockaddr_storage *addr,
					      sctp_assoc_t id)
{
	struct sctp_association *addr_asoc = NULL, *id_asoc = NULL;
	struct sctp_af *af = sctp_get_af_specific(addr->ss_family);
	union sctp_addr *laddr = (union sctp_addr *)addr;
	struct sctp_transport *transport;

	if (!af || sctp_verify_addr(sk, laddr, af->sockaddr_len))
		return NULL;

	addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep,
					       laddr,
					       &transport);

	if (!addr_asoc)
		return NULL;

	id_asoc = sctp_id2assoc(sk, id);
	if (id_asoc && (id_asoc != addr_asoc))
		return NULL;

	sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk),
						(union sctp_addr *)addr);

	return transport;
}

/* API 3.1.2 bind() - UDP Style Syntax
 * The syntax of bind() is,
 *
 *   ret = bind(int sd, struct sockaddr *addr, int addr_len);
 *
 *   sd       - the socket descriptor returned by socket().
 *   addr     - the address structure (struct sockaddr_in or struct
 *              sockaddr_in6 [RFC 2553]),
 *   addr_len - the size of the address structure.
 */
static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	int retval = 0;

	lock_sock(sk);

	pr_debug("%s: sk:%p, addr:%p, addr_len:%d\n", __func__, sk,
		 addr, addr_len);

	/* Disallow binding twice. */
	if (!sctp_sk(sk)->ep->base.bind_addr.port)
		retval = sctp_do_bind(sk, (union sctp_addr *)addr,
				      addr_len);
	else
		retval = -EINVAL;

	release_sock(sk);

	return retval;
}

static int sctp_get_port_local(struct sock *, union sctp_addr *);

/* Verify this is a valid sockaddr. */
static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt,
					union sctp_addr *addr, int len)
{
	struct sctp_af *af;

	/* Check minimum size. */
	if (len < sizeof (struct sockaddr))
		return NULL;

	if (!opt->pf->af_supported(addr->sa.sa_family, opt))
		return NULL;

	if (addr->sa.sa_family == AF_INET6) {
		if (len < SIN6_LEN_RFC2133)
			return NULL;
		/* V4 mapped addresses are really of AF_INET family */
		if (ipv6_addr_v4mapped(&addr->v6.sin6_addr) &&
		    !opt->pf->af_supported(AF_INET, opt))
			return NULL;
	}

	/* If we get this far, af is valid. */
	af = sctp_get_af_specific(addr->sa.sa_family);

	if (len < af->sockaddr_len)
		return NULL;

	return af;
}

static void sctp_auto_asconf_init(struct sctp_sock *sp)
{
	struct net *net = sock_net(&sp->inet.sk);

	if (net->sctp.default_auto_asconf) {
		spin_lock_bh(&net->sctp.addr_wq_lock);
		list_add_tail(&sp->auto_asconf_list,
			      &net->sctp.auto_asconf_splist);
		spin_unlock_bh(&net->sctp.addr_wq_lock);
		sp->do_auto_asconf = 1;
	}
}
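/* A minimal user-space sketch of the bind() path handled above (fd is
 * hypothetical; headers and error handling are trimmed for brevity):
 *
 *	int fd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
 *	struct sockaddr_in sin = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(5000),
 *		.sin_addr   = { .s_addr = htonl(INADDR_ANY) },
 *	};
 *
 *	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * A second bind() on the same socket fails with EINVAL, matching the
 * "Disallow binding twice" check in sctp_bind(); additional local
 * addresses have to go through the sctp_bindx() path instead.
 */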
/* Bind a local address either to an endpoint or to an association. */
static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
{
	struct net *net = sock_net(sk);
	struct sctp_sock *sp = sctp_sk(sk);
	struct sctp_endpoint *ep = sp->ep;
	struct sctp_bind_addr *bp = &ep->base.bind_addr;
	struct sctp_af *af;
	unsigned short snum;
	int ret = 0;

	/* Common sockaddr verification. */
	af = sctp_sockaddr_af(sp, addr, len);
	if (!af) {
		pr_debug("%s: sk:%p, newaddr:%p, len:%d EINVAL\n",
			 __func__, sk, addr, len);
		return -EINVAL;
	}

	snum = ntohs(addr->v4.sin_port);

	pr_debug("%s: sk:%p, new addr:%pISc, port:%d, new port:%d, len:%d\n",
		 __func__, sk, &addr->sa, bp->port, snum, len);

	/* PF specific bind() address verification. */
	if (!sp->pf->bind_verify(sp, addr))
		return -EADDRNOTAVAIL;

	/* We must either be unbound, or bind to the same port.
	 * It's OK to allow 0 ports if we are already bound.
	 * We'll just inherit an already bound port in this case.
	 */
	if (bp->port) {
		if (!snum)
			snum = bp->port;
		else if (snum != bp->port) {
			pr_debug("%s: new port %d doesn't match existing port "
				 "%d\n", __func__, snum, bp->port);
			return -EINVAL;
		}
	}

	if (snum && inet_port_requires_bind_service(net, snum) &&
	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
		return -EACCES;

	/* See if the address matches any of the addresses we may have
	 * already bound before checking against other endpoints.
	 */
	if (sctp_bind_addr_match(bp, addr, sp))
		return -EINVAL;

	/* Make sure we are allowed to bind here.
	 * The function sctp_get_port_local() does duplicate address
	 * detection.
	 */
	addr->v4.sin_port = htons(snum);
	if (sctp_get_port_local(sk, addr))
		return -EADDRINUSE;

	/* Refresh ephemeral port. */
	if (!bp->port) {
		bp->port = inet_sk(sk)->inet_num;
		sctp_auto_asconf_init(sp);
	}

	/* Add the address to the bind address list.
	 * Use GFP_ATOMIC since BHs will be disabled.
	 */
	ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len,
				 SCTP_ADDR_SRC, GFP_ATOMIC);
	if (ret) {
		sctp_put_port(sk);
		return ret;
	}
	/* Copy back into socket for getsockname() use. */
	inet_sk(sk)->inet_sport = htons(inet_sk(sk)->inet_num);
	sp->pf->to_sk_saddr(addr, sk);

	return ret;
}

/* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks
 *
 * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged
 * at any one time.  If a sender, after sending an ASCONF chunk, decides
 * it needs to transfer another ASCONF Chunk, it MUST wait until the
 * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a
 * subsequent ASCONF. Note this restriction binds each side, so at any time
 * two ASCONF may be in-transit on any given association (one sent from each
 * endpoint).
 */
static int sctp_send_asconf(struct sctp_association *asoc,
			    struct sctp_chunk *chunk)
{
	int retval = 0;

	/* If there is an outstanding ASCONF chunk, queue it for later
	 * transmission.
	 */
	if (asoc->addip_last_asconf) {
		list_add_tail(&chunk->list, &asoc->addip_chunk_list);
		goto out;
	}

	/* Hold the chunk until an ASCONF_ACK is received. */
	sctp_chunk_hold(chunk);
	retval = sctp_primitive_ASCONF(asoc->base.net, asoc, chunk);
	if (retval)
		sctp_chunk_free(chunk);
	else
		asoc->addip_last_asconf = chunk;

out:
	return retval;
}
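/* A sketch of the R1 serialization implemented by sctp_send_asconf(),
 * with hypothetical chunks A1 and A2 queued back to back:
 *
 *	sctp_send_asconf(asoc, A1);	// addip_last_asconf == NULL,
 *					// so A1 goes out on the wire
 *	sctp_send_asconf(asoc, A2);	// A1 is still unacked, so A2 is
 *					// parked on addip_chunk_list
 *
 * When the ASCONF-ACK for A1 arrives, the ACK-processing code (outside
 * this excerpt) clears addip_last_asconf and dequeues A2, so at most one
 * locally originated ASCONF is ever in flight per association.
 */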
*/ static int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt) { int cnt; int retval = 0; void *addr_buf; struct sockaddr *sa_addr; struct sctp_af *af; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); addr_buf = addrs; for (cnt = 0; cnt < addrcnt; cnt++) { /* The list may contain either IPv4 or IPv6 address; * determine the address length for walking thru the list. */ sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); if (!af) { retval = -EINVAL; goto err_bindx_add; } retval = sctp_do_bind(sk, (union sctp_addr *)sa_addr, af->sockaddr_len); addr_buf += af->sockaddr_len; err_bindx_add: if (retval < 0) { /* Failed. Cleanup the ones that have been added */ if (cnt > 0) sctp_bindx_rem(sk, addrs, cnt); return retval; } } return retval; } /* Send an ASCONF chunk with Add IP address parameters to all the peers of the * associations that are part of the endpoint indicating that a list of local * addresses are added to the endpoint. * * If any of the addresses is already in the bind address list of the * association, we do not send the chunk for that association. But it will not * affect other associations. * * Only sctp_setsockopt_bindx() is supposed to call this function. */ static int sctp_send_asconf_add_ip(struct sock *sk, struct sockaddr *addrs, int addrcnt) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *asoc; struct sctp_bind_addr *bp; struct sctp_chunk *chunk; struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; union sctp_addr saveaddr; void *addr_buf; struct sctp_af *af; struct list_head *p; int i; int retval = 0; sp = sctp_sk(sk); ep = sp->ep; if (!ep->asconf_enable) return retval; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); list_for_each_entry(asoc, &ep->asocs, asocs) { if (!asoc->peer.asconf_capable) continue; if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP) continue; if (!sctp_state(asoc, ESTABLISHED)) continue; /* Check if any address in the packed array of addresses is * in the bind address list of the association. If so, * do not send the asconf chunk to its peer, but continue with * other associations. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) { retval = -EINVAL; goto out; } if (sctp_assoc_lookup_laddr(asoc, addr)) break; addr_buf += af->sockaddr_len; } if (i < addrcnt) continue; /* Use the first valid address in bind addr list of * association as Address Parameter of ASCONF CHUNK. */ bp = &asoc->base.bind_addr; p = bp->address_list.next; laddr = list_entry(p, struct sctp_sockaddr_entry, list); chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs, addrcnt, SCTP_PARAM_ADD_IP); if (!chunk) { retval = -ENOMEM; goto out; } /* Add the new addresses to the bind address list with * use_as_src set to 0. 
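 * They are added in the SCTP_ADDR_NEW state and are not used as source
 * addresses until the peer acknowledges the ASCONF.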
 */
		addr_buf = addrs;
		for (i = 0; i < addrcnt; i++) {
			addr = addr_buf;
			af = sctp_get_af_specific(addr->v4.sin_family);
			memcpy(&saveaddr, addr, af->sockaddr_len);
			retval = sctp_add_bind_addr(bp, &saveaddr,
						    sizeof(saveaddr),
						    SCTP_ADDR_NEW, GFP_ATOMIC);
			addr_buf += af->sockaddr_len;
		}

		if (asoc->src_out_of_asoc_ok) {
			struct sctp_transport *trans;

			list_for_each_entry(trans,
			    &asoc->peer.transport_addr_list, transports) {
				trans->cwnd = min(4*asoc->pathmtu, max_t(__u32,
				    2*asoc->pathmtu, 4380));
				trans->ssthresh = asoc->peer.i.a_rwnd;
				trans->rto = asoc->rto_initial;
				sctp_max_rto(asoc, trans);
				trans->rtt = trans->srtt = trans->rttvar = 0;
				/* Clear the source and route cache */
				sctp_transport_route(trans, NULL,
						     sctp_sk(asoc->base.sk));
			}
		}

		retval = sctp_send_asconf(asoc, chunk);
	}

out:
	return retval;
}

/* Remove a list of addresses from the bind address list.  Do not remove
 * the last address.
 *
 * Basically run through each address specified in the addrs/addrcnt
 * array/length pair, determine if it is IPv6 or IPv4 and call
 * sctp_del_bind_addr() on it.
 *
 * If any of them fails, then the operation will be reversed and the
 * ones that were removed will be added back.
 *
 * At least one address has to be left; if only one address is
 * available, the operation will return -EBUSY.
 *
 * Only sctp_setsockopt_bindx() is supposed to call this function.
 */
static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt)
{
	struct sctp_sock *sp = sctp_sk(sk);
	struct sctp_endpoint *ep = sp->ep;
	int cnt;
	struct sctp_bind_addr *bp = &ep->base.bind_addr;
	int retval = 0;
	void *addr_buf;
	union sctp_addr *sa_addr;
	struct sctp_af *af;

	pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n",
		 __func__, sk, addrs, addrcnt);

	addr_buf = addrs;
	for (cnt = 0; cnt < addrcnt; cnt++) {
		/* If the bind address list is empty or if there is only one
		 * bind address, there is nothing more to be removed (we need
		 * at least one address here).
		 */
		if (list_empty(&bp->address_list) ||
		    (sctp_list_single_entry(&bp->address_list))) {
			retval = -EBUSY;
			goto err_bindx_rem;
		}

		sa_addr = addr_buf;
		af = sctp_get_af_specific(sa_addr->sa.sa_family);
		if (!af) {
			retval = -EINVAL;
			goto err_bindx_rem;
		}

		if (!af->addr_valid(sa_addr, sp, NULL)) {
			retval = -EADDRNOTAVAIL;
			goto err_bindx_rem;
		}

		if (sa_addr->v4.sin_port &&
		    sa_addr->v4.sin_port != htons(bp->port)) {
			retval = -EINVAL;
			goto err_bindx_rem;
		}

		if (!sa_addr->v4.sin_port)
			sa_addr->v4.sin_port = htons(bp->port);

		/* FIXME - There is probably a need to check if sk->sk_saddr and
		 * sk->sk_rcv_addr are currently set to one of the addresses to
		 * be removed. This is something which needs to be looked into
		 * when we are fixing the outstanding issues with multi-homing
		 * socket routing and failover schemes. Refer to comments in
		 * sctp_do_bind(). -daisy
		 */
		retval = sctp_del_bind_addr(bp, sa_addr);

		addr_buf += af->sockaddr_len;
err_bindx_rem:
		if (retval < 0) {
			/* Failed. Add the ones that have been removed back. */
			if (cnt > 0)
				sctp_bindx_add(sk, addrs, cnt);
			return retval;
		}
	}

	return retval;
}

/* Send an ASCONF chunk with Delete IP address parameters to all the peers of
 * the associations that are part of the endpoint indicating that a list of
 * local addresses are removed from the endpoint.
 *
 * If any of the addresses is not in the bind address list of the
 * association, we do not send the chunk for that association.  But it
 * will not affect other associations.
 *
 * Only sctp_setsockopt_bindx() is supposed to call this function.
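 * (When the deletion would empty an association's address list, the one
 * remaining address is instead parked in asconf_addr_del_pending and
 * handled later; see the addrcnt == 1 special case below.)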
*/ static int sctp_send_asconf_del_ip(struct sock *sk, struct sockaddr *addrs, int addrcnt) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *asoc; struct sctp_transport *transport; struct sctp_bind_addr *bp; struct sctp_chunk *chunk; union sctp_addr *laddr; void *addr_buf; struct sctp_af *af; struct sctp_sockaddr_entry *saddr; int i; int retval = 0; int stored = 0; chunk = NULL; sp = sctp_sk(sk); ep = sp->ep; if (!ep->asconf_enable) return retval; pr_debug("%s: sk:%p, addrs:%p, addrcnt:%d\n", __func__, sk, addrs, addrcnt); list_for_each_entry(asoc, &ep->asocs, asocs) { if (!asoc->peer.asconf_capable) continue; if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP) continue; if (!sctp_state(asoc, ESTABLISHED)) continue; /* Check if any address in the packed array of addresses is * not present in the bind address list of the association. * If so, do not send the asconf chunk to its peer, but * continue with other associations. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { laddr = addr_buf; af = sctp_get_af_specific(laddr->v4.sin_family); if (!af) { retval = -EINVAL; goto out; } if (!sctp_assoc_lookup_laddr(asoc, laddr)) break; addr_buf += af->sockaddr_len; } if (i < addrcnt) continue; /* Find one address in the association's bind address list * that is not in the packed array of addresses. This is to * make sure that we do not delete all the addresses in the * association. */ bp = &asoc->base.bind_addr; laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs, addrcnt, sp); if ((laddr == NULL) && (addrcnt == 1)) { if (asoc->asconf_addr_del_pending) continue; asoc->asconf_addr_del_pending = kzalloc(sizeof(union sctp_addr), GFP_ATOMIC); if (asoc->asconf_addr_del_pending == NULL) { retval = -ENOMEM; goto out; } asoc->asconf_addr_del_pending->sa.sa_family = addrs->sa_family; asoc->asconf_addr_del_pending->v4.sin_port = htons(bp->port); if (addrs->sa_family == AF_INET) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addrs; asoc->asconf_addr_del_pending->v4.sin_addr.s_addr = sin->sin_addr.s_addr; } else if (addrs->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addrs; asoc->asconf_addr_del_pending->v6.sin6_addr = sin6->sin6_addr; } pr_debug("%s: keep the last address asoc:%p %pISc at %p\n", __func__, asoc, &asoc->asconf_addr_del_pending->sa, asoc->asconf_addr_del_pending); asoc->src_out_of_asoc_ok = 1; stored = 1; goto skip_mkasconf; } if (laddr == NULL) return -EINVAL; /* We do not need RCU protection throughout this loop * because this is done under a socket lock from the * setsockopt call. */ chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt, SCTP_PARAM_DEL_IP); if (!chunk) { retval = -ENOMEM; goto out; } skip_mkasconf: /* Reset use_as_src flag for the addresses in the bind address * list that are to be deleted. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { laddr = addr_buf; af = sctp_get_af_specific(laddr->v4.sin_family); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, laddr)) saddr->state = SCTP_ADDR_DEL; } addr_buf += af->sockaddr_len; } /* Update the route and saddr entries for all the transports * as some of the addresses in the bind address list are * about to be deleted and cannot be used as source addresses. 
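 * (Passing a NULL saddr to sctp_transport_route() below forces a fresh
 * source address selection on the next transmit.)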
 */
		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
				    transports) {
			sctp_transport_route(transport, NULL,
					     sctp_sk(asoc->base.sk));
		}

		if (stored)
			/* We don't need to transmit ASCONF */
			continue;
		retval = sctp_send_asconf(asoc, chunk);
	}
out:
	return retval;
}

/* Set addr events to assocs in the endpoint.  ep and addr_wq must be locked */
int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
{
	struct sock *sk = sctp_opt2sk(sp);
	union sctp_addr *addr;
	struct sctp_af *af;

	/* It is safe to write port space in caller. */
	addr = &addrw->a;
	addr->v4.sin_port = htons(sp->ep->base.bind_addr.port);
	af = sctp_get_af_specific(addr->sa.sa_family);
	if (!af)
		return -EINVAL;
	if (sctp_verify_addr(sk, addr, af->sockaddr_len))
		return -EINVAL;

	if (addrw->state == SCTP_ADDR_NEW)
		return sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
	else
		return sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);
}

/* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
 *
 * API 8.1
 * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt,
 *                int flags);
 *
 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
 * or IPv6 addresses.
 *
 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
 * Section 3.1.2 for this usage.
 *
 * addrs is a pointer to an array of one or more socket addresses. Each
 * address is contained in its appropriate structure (i.e. struct
 * sockaddr_in or struct sockaddr_in6); the family of the address type
 * must be used to distinguish the address length (note that this
 * representation is termed a "packed array" of addresses). The caller
 * specifies the number of addresses in the array with addrcnt.
 *
 * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns
 * -1, and sets errno to the appropriate error code.
 *
 * For SCTP, the port given in each socket address must be the same, or
 * sctp_bindx() will fail, setting errno to EINVAL.
 *
 * The flags parameter is formed from the bitwise OR of zero or more of
 * the following currently defined flags:
 *
 * SCTP_BINDX_ADD_ADDR
 *
 * SCTP_BINDX_REM_ADDR
 *
 * SCTP_BINDX_ADD_ADDR directs SCTP to add the given addresses to the
 * association, and SCTP_BINDX_REM_ADDR directs SCTP to remove the given
 * addresses from the association. The two flags are mutually exclusive;
 * if both are given, sctp_bindx() will fail with EINVAL. A caller may
 * not remove all addresses from an association; sctp_bindx() will
 * reject such an attempt with EINVAL.
 *
 * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate
 * additional addresses with an endpoint after calling bind(). Or it can
 * use sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a
 * listening socket is associated with, so that no newly accepted
 * association will be associated with those addresses. If the endpoint
 * supports dynamic address reconfiguration, an SCTP_BINDX_REM_ADDR or
 * SCTP_BINDX_ADD_ADDR may cause the endpoint to send the appropriate
 * message to the peer to change the peer's address lists.
 *
 * Adding and removing addresses from a connected association is
 * optional functionality. Implementations that do not support this
 * functionality should return EOPNOTSUPP.
 *
 * Basically do nothing but copy the addresses from user to kernel land
 * and invoke either sctp_bindx_add() or sctp_bindx_rem() on the sk.
 * This is used for tunneling the sctp_bindx() request through
 * sctp_setsockopt() from userspace.
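 *
 * Illustrative user-space sketch (not part of this file; assumes the
 * lksctp-tools sctp_bindx() wrapper and a socket sd already bound to
 * port 5000 on one address):
 *
 *	struct sockaddr_in extra = { .sin_family = AF_INET,
 *				     .sin_port = htons(5000) };
 *
 *	inet_pton(AF_INET, "192.0.2.2", &extra.sin_addr);
 *	if (sctp_bindx(sd, (struct sockaddr *)&extra, 1,
 *		       SCTP_BINDX_ADD_ADDR) < 0)
 *		perror("sctp_bindx");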
* * On exit there is no need to do sockfd_put(), sys_setsockopt() does * it. * * sk The sk of the socket * addrs The pointer to the addresses * addrssize Size of the addrs buffer * op Operation to perform (add or remove, see the flags of * sctp_bindx) * * Returns 0 if ok, <0 errno code on error. */ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs, int addrs_size, int op) { int err; int addrcnt = 0; int walk_size = 0; struct sockaddr *sa_addr; void *addr_buf = addrs; struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", __func__, sk, addr_buf, addrs_size, op); if (unlikely(addrs_size <= 0)) return -EINVAL; /* Walk through the addrs buffer and count the number of addresses. */ while (walk_size < addrs_size) { if (walk_size + sizeof(sa_family_t) > addrs_size) return -EINVAL; sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. */ if (!af || (walk_size + af->sockaddr_len) > addrs_size) return -EINVAL; addrcnt++; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; } /* Do the work. */ switch (op) { case SCTP_BINDX_ADD_ADDR: /* Allow security module to validate bindx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD, addrs, addrs_size); if (err) return err; err = sctp_bindx_add(sk, addrs, addrcnt); if (err) return err; return sctp_send_asconf_add_ip(sk, addrs, addrcnt); case SCTP_BINDX_REM_ADDR: err = sctp_bindx_rem(sk, addrs, addrcnt); if (err) return err; return sctp_send_asconf_del_ip(sk, addrs, addrcnt); default: return -EINVAL; } } static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, int addrlen) { int err; lock_sock(sk); err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR); release_sock(sk); return err; } static int sctp_connect_new_asoc(struct sctp_endpoint *ep, const union sctp_addr *daddr, const struct sctp_initmsg *init, struct sctp_transport **tp) { struct sctp_association *asoc; struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); enum sctp_scope scope; int err; if (sctp_endpoint_is_peeled_off(ep, daddr)) return -EADDRNOTAVAIL; if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; } else { if (inet_port_requires_bind_service(net, ep->base.bind_addr.port) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; } scope = sctp_scope(daddr); asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); if (!asoc) return -ENOMEM; err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); if (err < 0) goto free; *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); if (!*tp) { err = -ENOMEM; goto free; } if (!init) return 0; if (init->sinit_num_ostreams) { __u16 outcnt = init->sinit_num_ostreams; asoc->c.sinit_num_ostreams = outcnt; /* outcnt has been changed, need to re-init stream */ err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL); if (err) goto free; } if (init->sinit_max_instreams) asoc->c.sinit_max_instreams = init->sinit_max_instreams; if (init->sinit_max_attempts) asoc->max_init_attempts = init->sinit_max_attempts; if (init->sinit_max_init_timeo) asoc->max_init_timeo = msecs_to_jiffies(init->sinit_max_init_timeo); return 0; free: sctp_association_free(asoc); return err; } static int sctp_connect_add_peer(struct sctp_association *asoc, union sctp_addr *daddr, int addr_len) { struct sctp_endpoint *ep = asoc->ep; struct sctp_association *old; struct sctp_transport *t; int err; err = 
sctp_verify_addr(ep->base.sk, daddr, addr_len);
	if (err)
		return err;

	old = sctp_endpoint_lookup_assoc(ep, daddr, &t);
	if (old && old != asoc)
		return old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
							    : -EALREADY;

	if (sctp_endpoint_is_peeled_off(ep, daddr))
		return -EADDRNOTAVAIL;

	t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
	if (!t)
		return -ENOMEM;

	return 0;
}

/* __sctp_connect(struct sock *sk, struct sockaddr *kaddrs, int addrs_size)
 *
 * Common routine for handling connect() and sctp_connectx().
 * Connect will come in with just a single address.
 */
static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
			  int addrs_size, int flags, sctp_assoc_t *assoc_id)
{
	struct sctp_sock *sp = sctp_sk(sk);
	struct sctp_endpoint *ep = sp->ep;
	struct sctp_transport *transport;
	struct sctp_association *asoc;
	void *addr_buf = kaddrs;
	union sctp_addr *daddr;
	struct sctp_af *af;
	int walk_size, err;
	long timeo;

	if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) ||
	    (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)))
		return -EISCONN;

	daddr = addr_buf;
	af = sctp_get_af_specific(daddr->sa.sa_family);
	if (!af || af->sockaddr_len > addrs_size)
		return -EINVAL;

	err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
	if (err)
		return err;

	asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
	if (asoc)
		return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
							     : -EALREADY;

	err = sctp_connect_new_asoc(ep, daddr, NULL, &transport);
	if (err)
		return err;
	asoc = transport->asoc;

	addr_buf += af->sockaddr_len;
	walk_size = af->sockaddr_len;
	while (walk_size < addrs_size) {
		err = -EINVAL;
		if (walk_size + sizeof(sa_family_t) > addrs_size)
			goto out_free;

		daddr = addr_buf;
		af = sctp_get_af_specific(daddr->sa.sa_family);
		if (!af || af->sockaddr_len + walk_size > addrs_size)
			goto out_free;

		if (asoc->peer.port != ntohs(daddr->v4.sin_port))
			goto out_free;

		err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len);
		if (err)
			goto out_free;

		addr_buf += af->sockaddr_len;
		walk_size += af->sockaddr_len;
	}

	/* In case the user of sctp_connectx() wants an association
	 * id back, assign one now.
	 */
	if (assoc_id) {
		err = sctp_assoc_set_id(asoc, GFP_KERNEL);
		if (err < 0)
			goto out_free;
	}

	err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL);
	if (err < 0)
		goto out_free;

	/* Initialize sk's dport and daddr for getpeername() */
	inet_sk(sk)->inet_dport = htons(asoc->peer.port);
	sp->pf->to_sk_daddr(daddr, sk);
	sk->sk_err = 0;

	if (assoc_id)
		*assoc_id = asoc->assoc_id;

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
	return sctp_wait_for_connect(asoc, &timeo);

out_free:
	pr_debug("%s: took out_free path with asoc:%p kaddrs:%p err:%d\n",
		 __func__, asoc, kaddrs, err);

	sctp_association_free(asoc);

	return err;
}

/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
 *
 * API 8.9
 * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt,
 *                   sctp_assoc_t *asoc);
 *
 * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
 * If the sd is an IPv6 socket, the addresses passed can either be IPv4
 * or IPv6 addresses.
 *
 * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
 * Section 3.1.2 for this usage.
 *
 * addrs is a pointer to an array of one or more socket addresses. Each
 * address is contained in its appropriate structure (i.e. struct
 * sockaddr_in or struct sockaddr_in6); the family of the address type
 * must be used to distinguish the address length (note that this
 * representation is termed a "packed array" of addresses).
 * The caller specifies the number of addresses in the array with addrcnt.
 *
 * On success, sctp_connectx() returns 0. It also sets the assoc_id to
 * the association id of the new association. On failure, sctp_connectx()
 * returns -1, and sets errno to the appropriate error code. The assoc_id
 * is not touched by the kernel.
 *
 * For SCTP, the port given in each socket address must be the same, or
 * sctp_connectx() will fail, setting errno to EINVAL.
 *
 * An application can use sctp_connectx to initiate an association with
 * an endpoint that is multi-homed. Much like sctp_bindx() this call
 * allows a caller to specify multiple addresses at which a peer can be
 * reached. The way the SCTP stack uses the list of addresses to set up
 * the association is implementation dependent. This function only
 * specifies that the stack will try to make use of all the addresses in
 * the list when needed.
 *
 * Note that the list of addresses passed in is only used for setting up
 * the association. It does not necessarily equal the set of addresses
 * the peer uses for the resulting association. If the caller wants to
 * find out the set of peer addresses, it must use sctp_getpaddrs() to
 * retrieve them after the association has been set up.
 *
 * Basically do nothing but copy the addresses from user to kernel land
 * and invoke __sctp_connect(). This is used for tunneling the
 * sctp_connectx() request through sctp_setsockopt() from userspace.
 *
 * On exit there is no need to do sockfd_put(), sys_setsockopt() does
 * it.
 *
 * sk          The sk of the socket
 * addrs       The pointer to the addresses
 * addrssize   Size of the addrs buffer
 *
 * Returns >=0 if ok, <0 errno code on error.
 */
static int __sctp_setsockopt_connectx(struct sock *sk, struct sockaddr *kaddrs,
				      int addrs_size, sctp_assoc_t *assoc_id)
{
	int err = 0, flags = 0;

	pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
		 __func__, sk, kaddrs, addrs_size);

	/* make sure the 1st addr's sa_family is accessible later */
	if (unlikely(addrs_size < sizeof(sa_family_t)))
		return -EINVAL;

	/* Allow security module to validate connectx addresses. */
	err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX,
					 (struct sockaddr *)kaddrs,
					 addrs_size);
	if (err)
		return err;

	/* in-kernel sockets don't generally have a file allocated to them
	 * if all they do is call sock_create_kern().
	 */
	if (sk->sk_socket->file)
		flags = sk->sk_socket->file->f_flags;

	return __sctp_connect(sk, kaddrs, addrs_size, flags, assoc_id);
}

/*
 * This is an older interface.  It's kept for backward compatibility
 * to the option that doesn't provide association id.
 */
static int sctp_setsockopt_connectx_old(struct sock *sk,
					struct sockaddr *kaddrs,
					int addrs_size)
{
	return __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, NULL);
}

/*
 * New interface for the API.  Since the API is done with a socket
 * option, to make it simple we feed back the association id as a return
 * indication to the call.  Error is always negative and association id
 * is always positive.
 */
static int sctp_setsockopt_connectx(struct sock *sk,
				    struct sockaddr *kaddrs,
				    int addrs_size)
{
	sctp_assoc_t assoc_id = 0;
	int err = 0;

	err = __sctp_setsockopt_connectx(sk, kaddrs, addrs_size, &assoc_id);

	if (err)
		return err;
	else
		return assoc_id;
}

/*
 * New (hopefully final) interface for the API.
 * We use the sctp_getaddrs_old structure so that the user-space library
 * can avoid any unnecessary allocations. The only difference is that we
 * store the actual length of the address buffer into the
 * addrs_num structure member.
That way we can re-use the existing * code. */ #ifdef CONFIG_COMPAT struct compat_sctp_getaddrs_old { sctp_assoc_t assoc_id; s32 addr_num; compat_uptr_t addrs; /* struct sockaddr * */ }; #endif static int sctp_getsockopt_connectx3(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_getaddrs_old param; sctp_assoc_t assoc_id = 0; struct sockaddr *kaddrs; int err = 0; #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_sctp_getaddrs_old param32; if (len < sizeof(param32)) return -EINVAL; if (copy_from_user(¶m32, optval, sizeof(param32))) return -EFAULT; param.assoc_id = param32.assoc_id; param.addr_num = param32.addr_num; param.addrs = compat_ptr(param32.addrs); } else #endif { if (len < sizeof(param)) return -EINVAL; if (copy_from_user(¶m, optval, sizeof(param))) return -EFAULT; } kaddrs = memdup_user(param.addrs, param.addr_num); if (IS_ERR(kaddrs)) return PTR_ERR(kaddrs); err = __sctp_setsockopt_connectx(sk, kaddrs, param.addr_num, &assoc_id); kfree(kaddrs); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) return -EFAULT; if (put_user(sizeof(assoc_id), optlen)) return -EFAULT; } return err; } /* API 3.1.4 close() - UDP Style Syntax * Applications use close() to perform graceful shutdown (as described in * Section 10.1 of [SCTP]) on ALL the associations currently represented * by a UDP-style socket. * * The syntax is * * ret = close(int sd); * * sd - the socket descriptor of the associations to be closed. * * To gracefully shutdown a specific association represented by the * UDP-style socket, an application should use the sendmsg() call, * passing no user data, but including the appropriate flag in the * ancillary data (see Section xxxx). * * If sd in the close() call is a branched-off socket representing only * one association, the shutdown is performed on that association only. * * 4.1.6 close() - TCP Style Syntax * * Applications use close() to gracefully close down an association. * * The syntax is: * * int close(int sd); * * sd - the socket descriptor of the association to be closed. * * After an application calls close() on a socket descriptor, no further * socket operations will succeed on that descriptor. * * API 7.1.4 SO_LINGER * * An application using the TCP-style socket can use this option to * perform the SCTP ABORT primitive. The linger option structure is: * * struct linger { * int l_onoff; // option on/off * int l_linger; // linger time * }; * * To enable the option, set l_onoff to 1. If the l_linger value is set * to 0, calling close() is the same as the ABORT primitive. If the * value is set to a negative value, the setsockopt() call will return * an error. If the value is set to a positive value linger_time, the * close() can be blocked for at most linger_time ms. If the graceful * shutdown phase does not finish during this period, close() will * return but the graceful shutdown phase continues in the system. */ static void sctp_close(struct sock *sk, long timeout) { struct net *net = sock_net(sk); struct sctp_endpoint *ep; struct sctp_association *asoc; struct list_head *pos, *temp; unsigned int data_was_unread; pr_debug("%s: sk:%p, timeout:%ld\n", __func__, sk, timeout); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); sk->sk_shutdown = SHUTDOWN_MASK; inet_sk_set_state(sk, SCTP_SS_CLOSING); ep = sctp_sk(sk)->ep; /* Clean up any skbs sitting on the receive queue. 
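 * (The amount of unread data helps decide below whether each association
 * is torn down with an ABORT or a graceful SHUTDOWN.)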
*/ data_was_unread = sctp_queue_purge_ulpevents(&sk->sk_receive_queue); data_was_unread += sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby); /* Walk all associations on an endpoint. */ list_for_each_safe(pos, temp, &ep->asocs) { asoc = list_entry(pos, struct sctp_association, asocs); if (sctp_style(sk, TCP)) { /* A closed association can still be in the list if * it belongs to a TCP-style listening socket that is * not yet accepted. If so, free it. If not, send an * ABORT or SHUTDOWN based on the linger options. */ if (sctp_state(asoc, CLOSED)) { sctp_association_free(asoc); continue; } } if (data_was_unread || !skb_queue_empty(&asoc->ulpq.lobby) || !skb_queue_empty(&asoc->ulpq.reasm) || !skb_queue_empty(&asoc->ulpq.reasm_uo) || (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, NULL, 0); sctp_primitive_ABORT(net, asoc, chunk); } else sctp_primitive_SHUTDOWN(net, asoc, NULL); } /* On a TCP-style socket, block for at most linger_time if set. */ if (sctp_style(sk, TCP) && timeout) sctp_wait_for_close(sk, timeout); /* This will run the backlog queue. */ release_sock(sk); /* Supposedly, no process has access to the socket, but * the net layers still may. * Also, sctp_destroy_sock() needs to be called with addr_wq_lock * held and that should be grabbed before socket lock. */ spin_lock_bh(&net->sctp.addr_wq_lock); bh_lock_sock_nested(sk); /* Hold the sock, since sk_common_release() will put sock_put() * and we have just a little more cleanup. */ sock_hold(sk); sk_common_release(sk); bh_unlock_sock(sk); spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); SCTP_DBG_OBJCNT_DEC(sock); } /* Handle EPIPE error. */ static int sctp_error(struct sock *sk, int flags, int err) { if (err == -EPIPE) err = sock_error(sk) ? : -EPIPE; if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); return err; } /* API 3.1.3 sendmsg() - UDP Style Syntax * * An application uses sendmsg() and recvmsg() calls to transmit data to * and receive data from its peer. * * ssize_t sendmsg(int socket, const struct msghdr *message, * int flags); * * socket - the socket descriptor of the endpoint. * message - pointer to the msghdr structure which contains a single * user message and possibly some ancillary data. * * See Section 5 for complete description of the data * structures. * * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. * * Note: This function could use a rewrite especially when explicit * connect support comes in. */ /* BUG: We do not implement the equivalent of sk_stream_wait_memory(). 
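 * (Senders block in sctp_wait_for_sndbuf() below once an association
 * runs out of send space.)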
*/ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs); static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs, struct sctp_sndrcvinfo *srinfo, const struct msghdr *msg, size_t msg_len) { __u16 sflags; int err; if (sctp_sstate(sk, LISTENING) && sctp_style(sk, TCP)) return -EPIPE; if (msg_len > sk->sk_sndbuf) return -EMSGSIZE; memset(cmsgs, 0, sizeof(*cmsgs)); err = sctp_msghdr_parse(msg, cmsgs); if (err) { pr_debug("%s: msghdr parse err:%x\n", __func__, err); return err; } memset(srinfo, 0, sizeof(*srinfo)); if (cmsgs->srinfo) { srinfo->sinfo_stream = cmsgs->srinfo->sinfo_stream; srinfo->sinfo_flags = cmsgs->srinfo->sinfo_flags; srinfo->sinfo_ppid = cmsgs->srinfo->sinfo_ppid; srinfo->sinfo_context = cmsgs->srinfo->sinfo_context; srinfo->sinfo_assoc_id = cmsgs->srinfo->sinfo_assoc_id; srinfo->sinfo_timetolive = cmsgs->srinfo->sinfo_timetolive; } if (cmsgs->sinfo) { srinfo->sinfo_stream = cmsgs->sinfo->snd_sid; srinfo->sinfo_flags = cmsgs->sinfo->snd_flags; srinfo->sinfo_ppid = cmsgs->sinfo->snd_ppid; srinfo->sinfo_context = cmsgs->sinfo->snd_context; srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id; } if (cmsgs->prinfo) { srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value; SCTP_PR_SET_POLICY(srinfo->sinfo_flags, cmsgs->prinfo->pr_policy); } sflags = srinfo->sinfo_flags; if (!sflags && msg_len) return 0; if (sctp_style(sk, TCP) && (sflags & (SCTP_EOF | SCTP_ABORT))) return -EINVAL; if (((sflags & SCTP_EOF) && msg_len > 0) || (!(sflags & (SCTP_EOF | SCTP_ABORT)) && msg_len == 0)) return -EINVAL; if ((sflags & SCTP_ADDR_OVER) && !msg->msg_name) return -EINVAL; return 0; } static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_cmsgs *cmsgs, union sctp_addr *daddr, struct sctp_transport **tp) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; struct cmsghdr *cmsg; __be32 flowinfo = 0; struct sctp_af *af; int err; *tp = NULL; if (sflags & (SCTP_EOF | SCTP_ABORT)) return -EINVAL; if (sctp_style(sk, TCP) && (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING))) return -EADDRNOTAVAIL; /* Label connection socket for first association 1-to-many * style for client sequence socket()->sendmsg(). This * needs to be done before sctp_assoc_add_peer() as that will * set up the initial packet that needs to account for any * security ip options (CIPSO/CALIPSO) added to the packet. 
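 * (This is also why the security_sctp_bind_connect() hook below runs
 * before sctp_connect_new_asoc() creates the association and its first
 * transport.)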
*/ af = sctp_get_af_specific(daddr->sa.sa_family); if (!af) return -EINVAL; err = security_sctp_bind_connect(sk, SCTP_SENDMSG_CONNECT, (struct sockaddr *)daddr, af->sockaddr_len); if (err < 0) return err; err = sctp_connect_new_asoc(ep, daddr, cmsgs->init, tp); if (err) return err; asoc = (*tp)->asoc; if (!cmsgs->addrs_msg) return 0; if (daddr->sa.sa_family == AF_INET6) flowinfo = daddr->v6.sin6_flowinfo; /* sendv addr list parse */ for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { union sctp_addr _daddr; int dlen; if (cmsg->cmsg_level != IPPROTO_SCTP || (cmsg->cmsg_type != SCTP_DSTADDRV4 && cmsg->cmsg_type != SCTP_DSTADDRV6)) continue; daddr = &_daddr; memset(daddr, 0, sizeof(*daddr)); dlen = cmsg->cmsg_len - sizeof(struct cmsghdr); if (cmsg->cmsg_type == SCTP_DSTADDRV4) { if (dlen < sizeof(struct in_addr)) { err = -EINVAL; goto free; } dlen = sizeof(struct in_addr); daddr->v4.sin_family = AF_INET; daddr->v4.sin_port = htons(asoc->peer.port); memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen); } else { if (dlen < sizeof(struct in6_addr)) { err = -EINVAL; goto free; } dlen = sizeof(struct in6_addr); daddr->v6.sin6_flowinfo = flowinfo; daddr->v6.sin6_family = AF_INET6; daddr->v6.sin6_port = htons(asoc->peer.port); memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); } err = sctp_connect_add_peer(asoc, daddr, sizeof(*daddr)); if (err) goto free; } return 0; free: sctp_association_free(asoc); return err; } static int sctp_sendmsg_check_sflags(struct sctp_association *asoc, __u16 sflags, struct msghdr *msg, size_t msg_len) { struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) return -EPIPE; if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) && !sctp_state(asoc, ESTABLISHED)) return 0; if (sflags & SCTP_EOF) { pr_debug("%s: shutting down association:%p\n", __func__, asoc); sctp_primitive_SHUTDOWN(net, asoc, NULL); return 0; } if (sflags & SCTP_ABORT) { struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, msg, msg_len); if (!chunk) return -ENOMEM; pr_debug("%s: aborting association:%p\n", __func__, asoc); sctp_primitive_ABORT(net, asoc, chunk); iov_iter_revert(&msg->msg_iter, msg_len); return 0; } return 1; } static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, struct msghdr *msg, size_t msg_len, struct sctp_transport *transport, struct sctp_sndrcvinfo *sinfo) { struct sock *sk = asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct net *net = sock_net(sk); struct sctp_datamsg *datamsg; bool wait_connect = false; struct sctp_chunk *chunk; long timeo; int err; if (sinfo->sinfo_stream >= asoc->stream.outcnt) { err = -EINVAL; goto err; } if (unlikely(!SCTP_SO(&asoc->stream, sinfo->sinfo_stream)->ext)) { err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); if (err) goto err; } if (sp->disable_fragments && msg_len > asoc->frag_point) { err = -EMSGSIZE; goto err; } if (asoc->pmtu_pending) { if (sp->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } if (sctp_wspace(asoc) < (int)msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) goto err; if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { err = -EINVAL; goto err; } } if (sctp_state(asoc, CLOSED)) { err = sctp_primitive_ASSOCIATE(net, asoc, NULL); if (err) goto err; if (asoc->ep->intl_enable) { timeo = sock_sndtimeo(sk, 0); 
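			/* When stream interleaving is enabled, block for the
			 * handshake right away rather than queueing data while
			 * the association establishes; a failure here is
			 * reported as -ESRCH below.
			 */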
err = sctp_wait_for_connect(asoc, &timeo); if (err) { err = -ESRCH; goto err; } } else { wait_connect = true; } pr_debug("%s: we associated primitively\n", __func__); } datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); if (IS_ERR(datamsg)) { err = PTR_ERR(datamsg); goto err; } asoc->force_delay = !!(msg->msg_flags & MSG_MORE); list_for_each_entry(chunk, &datamsg->chunks, frag_list) { sctp_chunk_hold(chunk); sctp_set_owner_w(chunk); chunk->transport = transport; } err = sctp_primitive_SEND(net, asoc, datamsg); if (err) { sctp_datamsg_free(datamsg); goto err; } pr_debug("%s: we sent primitively\n", __func__); sctp_datamsg_put(datamsg); if (unlikely(wait_connect)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); sctp_wait_for_connect(asoc, &timeo); } err = msg_len; err: return err; } static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk, const struct msghdr *msg, struct sctp_cmsgs *cmsgs) { union sctp_addr *daddr = NULL; int err; if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { int len = msg->msg_namelen; if (len > sizeof(*daddr)) len = sizeof(*daddr); daddr = (union sctp_addr *)msg->msg_name; err = sctp_verify_addr(sk, daddr, len); if (err) return ERR_PTR(err); } return daddr; } static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct sctp_cmsgs *cmsgs) { if (!cmsgs->srinfo && !cmsgs->sinfo) { sinfo->sinfo_stream = asoc->default_stream; sinfo->sinfo_ppid = asoc->default_ppid; sinfo->sinfo_context = asoc->default_context; sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); if (!cmsgs->prinfo) sinfo->sinfo_flags = asoc->default_flags; } if (!cmsgs->srinfo && !cmsgs->prinfo) sinfo->sinfo_timetolive = asoc->default_timetolive; if (cmsgs->authinfo) { /* Reuse sinfo_tsn to indicate that authinfo was set and * sinfo_ssn to save the keyid on tx path. 
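 * (Both fields are otherwise only meaningful on the receive path, so
 * they are free to carry this state on the send side.)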
*/ sinfo->sinfo_tsn = 1; sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber; } } static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_transport *transport = NULL; struct sctp_sndrcvinfo _sinfo, *sinfo; struct sctp_association *asoc, *tmp; struct sctp_cmsgs cmsgs; union sctp_addr *daddr; bool new = false; __u16 sflags; int err; /* Parse and get snd_info */ err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len); if (err) goto out; sinfo = &_sinfo; sflags = sinfo->sinfo_flags; /* Get daddr from msg */ daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs); if (IS_ERR(daddr)) { err = PTR_ERR(daddr); goto out; } lock_sock(sk); /* SCTP_SENDALL process */ if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) { list_for_each_entry_safe(asoc, tmp, &ep->asocs, asocs) { err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err == 0) continue; if (err < 0) goto out_unlock; sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, NULL, sinfo); if (err < 0) goto out_unlock; iov_iter_revert(&msg->msg_iter, err); } goto out_unlock; } /* Get and check or create asoc */ if (daddr) { asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); if (asoc) { err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err <= 0) goto out_unlock; } else { err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr, &transport); if (err) goto out_unlock; asoc = transport->asoc; new = true; } if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER)) transport = NULL; } else { asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id); if (!asoc) { err = -EPIPE; goto out_unlock; } err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err <= 0) goto out_unlock; } /* Update snd_info with the asoc */ sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); /* Send msg to the asoc */ err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo); if (err < 0 && err != -ESRCH && new) sctp_association_free(asoc); out_unlock: release_sock(sk); out: return sctp_error(sk, msg->msg_flags, err); } /* This is an extended version of skb_pull() that removes the data from the * start of a skb even when data is spread across the list of skb's in the * frag_list. len specifies the total amount of data that needs to be removed. * when 'len' bytes could be removed from the skb, it returns 0. * If 'len' exceeds the total skb length, it returns the no. of bytes that * could not be removed. */ static int sctp_skb_pull(struct sk_buff *skb, int len) { struct sk_buff *list; int skb_len = skb_headlen(skb); int rlen; if (len <= skb_len) { __skb_pull(skb, len); return 0; } len -= skb_len; __skb_pull(skb, skb_len); skb_walk_frags(skb, list) { rlen = sctp_skb_pull(list, len); skb->len -= (len-rlen); skb->data_len -= (len-rlen); if (!rlen) return 0; len = rlen; } return len; } /* API 3.1.3 recvmsg() - UDP Style Syntax * * ssize_t recvmsg(int socket, struct msghdr *message, * int flags); * * socket - the socket descriptor of the endpoint. * message - pointer to the msghdr structure which contains a single * user message and possibly some ancillary data. * * See Section 5 for complete description of the data * structures. * * flags - flags sent or received with the user message, see Section * 5 for complete description of the flags. 
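 *
 * Illustrative user-space sketch (not part of this file): receiving on a
 * one-to-many socket and telling notifications apart from user data,
 * where handle_event() stands in for the application's own handler:
 *
 *	char buf[8192];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr mh = { .msg_iov = &iov, .msg_iovlen = 1 };
 *	ssize_t n = recvmsg(sd, &mh, 0);
 *
 *	if (n > 0 && (mh.msg_flags & MSG_NOTIFICATION))
 *		handle_event(buf);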
*/ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sctp_ulpevent *event = NULL; struct sctp_sock *sp = sctp_sk(sk); struct sk_buff *skb, *head_skb; int copied; int err = 0; int skb_len; pr_debug("%s: sk:%p, msghdr:%p, len:%zd, flags:0x%x, addr_len:%p)\n", __func__, sk, msg, len, flags, addr_len); if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) && !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) { err = -ENOTCONN; goto out; } skb = sctp_skb_recv_datagram(sk, flags, &err); if (!skb) goto out; /* Get the total length of the skb including any skb's in the * frag_list. */ skb_len = skb->len; copied = skb_len; if (copied > len) copied = len; err = skb_copy_datagram_msg(skb, 0, msg, copied); event = sctp_skb2event(skb); if (err) goto out_free; if (event->chunk && event->chunk->head_skb) head_skb = event->chunk->head_skb; else head_skb = skb; sock_recv_cmsgs(msg, sk, head_skb); if (sctp_ulpevent_is_notification(event)) { msg->msg_flags |= MSG_NOTIFICATION; sp->pf->event_msgname(event, msg->msg_name, addr_len); } else { sp->pf->skb_msgname(head_skb, msg->msg_name, addr_len); } /* Check if we allow SCTP_NXTINFO. */ if (sp->recvnxtinfo) sctp_ulpevent_read_nxtinfo(event, msg, sk); /* Check if we allow SCTP_RCVINFO. */ if (sp->recvrcvinfo) sctp_ulpevent_read_rcvinfo(event, msg); /* Check if we allow SCTP_SNDRCVINFO. */ if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_DATA_IO_EVENT)) sctp_ulpevent_read_sndrcvinfo(event, msg); err = copied; /* If skb's length exceeds the user's buffer, update the skb and * push it back to the receive_queue so that the next call to * recvmsg() will return the remaining data. Don't set MSG_EOR. */ if (skb_len > copied) { msg->msg_flags &= ~MSG_EOR; if (flags & MSG_PEEK) goto out_free; sctp_skb_pull(skb, copied); skb_queue_head(&sk->sk_receive_queue, skb); /* When only partial message is copied to the user, increase * rwnd by that amount. If all the data in the skb is read, * rwnd is updated when the event is freed. */ if (!sctp_ulpevent_is_notification(event)) sctp_assoc_rwnd_increase(event->asoc, copied); goto out; } else if ((event->msg_flags & MSG_NOTIFICATION) || (event->msg_flags & MSG_EOR)) msg->msg_flags |= MSG_EOR; else msg->msg_flags &= ~MSG_EOR; out_free: if (flags & MSG_PEEK) { /* Release the skb reference acquired after peeking the skb in * sctp_skb_recv_datagram(). */ kfree_skb(skb); } else { /* Free the event which includes releasing the reference to * the owner of the skb, freeing the skb and updating the * rwnd. */ sctp_ulpevent_free(event); } out: release_sock(sk); return err; } /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) * * This option is a on/off flag. If enabled no SCTP message * fragmentation will be performed. Instead if a message being sent * exceeds the current PMTU size, the message will NOT be sent and * instead a error will be indicated to the user. */ static int sctp_setsockopt_disable_fragments(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->disable_fragments = (*val == 0) ? 
0 : 1; return 0; } static int sctp_setsockopt_events(struct sock *sk, __u8 *sn_type, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int i; if (optlen > sizeof(struct sctp_event_subscribe)) return -EINVAL; for (i = 0; i < optlen; i++) sctp_ulpevent_type_set(&sp->subscribe, SCTP_SN_TYPE_BASE + i, sn_type[i]); list_for_each_entry(asoc, &sp->ep->asocs, asocs) asoc->subscribe = sctp_sk(sk)->subscribe; /* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT, * if there is no data to be sent or retransmit, the stack will * immediately send up this notification. */ if (sctp_ulpevent_type_enabled(sp->subscribe, SCTP_SENDER_DRY_EVENT)) { struct sctp_ulpevent *event; asoc = sctp_id2assoc(sk, 0); if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; asoc->stream.si->enqueue_event(&asoc->ulpq, event); } } return 0; } /* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) * * This socket option is applicable to the UDP-style socket only. When * set it will cause associations that are idle for more than the * specified number of seconds to automatically close. An association * being idle is defined an association that has NOT sent or received * user data. The special value of '0' indicates that no automatic * close of any associations should be performed. The option expects an * integer defining the number of seconds of idle time before an * association is closed. */ static int sctp_setsockopt_autoclose(struct sock *sk, u32 *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct net *net = sock_net(sk); /* Applicable to UDP-style socket only */ if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (optlen != sizeof(int)) return -EINVAL; sp->autoclose = *optval; if (sp->autoclose > net->sctp.max_autoclose) sp->autoclose = net->sctp.max_autoclose; return 0; } /* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) * * Applications can enable or disable heartbeats for any peer address of * an association, modify an address's heartbeat interval, force a * heartbeat to be sent immediately, and adjust the address's maximum * number of retransmissions sent before an address is considered * unreachable. The following structure is used to access and modify an * address's parameters: * * struct sctp_paddrparams { * sctp_assoc_t spp_assoc_id; * struct sockaddr_storage spp_address; * uint32_t spp_hbinterval; * uint16_t spp_pathmaxrxt; * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; * uint32_t spp_ipv6_flowlabel; * uint8_t spp_dscp; * }; * * spp_assoc_id - (one-to-many style socket) This is filled in the * application, and identifies the association for * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, * in milliseconds. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be * considered unreachable. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmtu - When Path MTU discovery is disabled the value * specified here will be the "fixed" path mtu. * Note that if the spp_address field is empty * then all associations on this address will * have this fixed path mtu set upon them. 
 *
 * spp_sackdelay  - When delayed sack is enabled, this value specifies
 *                  the number of milliseconds that sacks will be delayed
 *                  for. This value will apply to all addresses of an
 *                  association if the spp_address field is empty. Note
 *                  also, that if delayed sack is enabled and this
 *                  value is set to 0, no change is made to the last
 *                  recorded delayed sack timer value.
 *
 * spp_flags      - These flags are used to control various features
 *                  on an association. The flag field may contain
 *                  zero or more of the following options.
 *
 *                  SPP_HB_ENABLE - Enable heartbeats on the
 *                  specified address. Note that if the address
 *                  field is empty all addresses for the association
 *                  have heartbeats enabled upon them.
 *
 *                  SPP_HB_DISABLE - Disable heartbeats on the
 *                  specified address. Note that if the address
 *                  field is empty all addresses for the association
 *                  will have their heartbeats disabled. Note also
 *                  that SPP_HB_ENABLE and SPP_HB_DISABLE are
 *                  mutually exclusive, only one of these two should
 *                  be specified. Enabling both fields will have
 *                  undetermined results.
 *
 *                  SPP_HB_DEMAND - Request a user initiated heartbeat
 *                  to be made immediately.
 *
 *                  SPP_HB_TIME_IS_ZERO - Specifies that the time for
 *                  heartbeat delay is to be set to the value of 0
 *                  milliseconds.
 *
 *                  SPP_PMTUD_ENABLE - This field will enable PMTU
 *                  discovery upon the specified address. Note that
 *                  if the address field is empty then all addresses
 *                  on the association are affected.
 *
 *                  SPP_PMTUD_DISABLE - This field will disable PMTU
 *                  discovery upon the specified address. Note that
 *                  if the address field is empty then all addresses
 *                  on the association are affected. Note also that
 *                  SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually
 *                  exclusive. Enabling both will have undetermined
 *                  results.
 *
 *                  SPP_SACKDELAY_ENABLE - Setting this flag turns
 *                  on delayed sack. The time specified in spp_sackdelay
 *                  is used to specify the sack delay for this address. Note
 *                  that if spp_address is empty then all addresses will
 *                  enable delayed sack and take on the sack delay
 *                  value specified in spp_sackdelay.
 *
 *                  SPP_SACKDELAY_DISABLE - Setting this flag turns
 *                  off delayed sack. If the spp_address field is blank then
 *                  delayed sack is disabled for the entire association. Note
 *                  also that this field is mutually exclusive to
 *                  SPP_SACKDELAY_ENABLE, setting both will have undefined
 *                  results.
 *
 *                  SPP_IPV6_FLOWLABEL - Setting this flag enables the
 *                  setting of the IPV6 flow label value. The value is
 *                  contained in the spp_ipv6_flowlabel field.
 *                  Upon retrieval, this flag will be set to indicate that
 *                  the spp_ipv6_flowlabel field has a valid value returned.
 *                  If a specific destination address is set (in the
 *                  spp_address field), then the value returned is that of
 *                  the address. If just an association is specified (and
 *                  no address), then the association's default flow label
 *                  is returned. If neither an association nor a destination
 *                  is specified, then the socket's default flow label is
 *                  returned. For non-IPv6 sockets, this flag will be left
 *                  cleared.
 *
 *                  SPP_DSCP - Setting this flag enables the setting of the
 *                  Differentiated Services Code Point (DSCP) value
 *                  associated with either the association or a specific
 *                  address. The value is obtained in the spp_dscp field.
 *                  Upon retrieval, this flag will be set to indicate that
 *                  the spp_dscp field has a valid value returned. If a
 *                  specific destination address is set when called (in the
 *                  spp_address field), then that specific destination
 *                  address's DSCP value is returned.
If just an association * is specified, then the association's default DSCP is * returned. If neither an association nor a destination is * specified, then the socket's default DSCP is returned. * * spp_ipv6_flowlabel * - This field is used in conjunction with the * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. * The 20 least significant bits are used for the flow * label. This setting has precedence over any IPv6-layer * setting. * * spp_dscp - This field is used in conjunction with the SPP_DSCP flag * and contains the DSCP. The 6 most significant bits are * used for the DSCP. This setting has precedence over any * IPv4- or IPv6- layer setting. */ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, struct sctp_transport *trans, struct sctp_association *asoc, struct sctp_sock *sp, int hb_change, int pmtud_change, int sackdelay_change) { int error; if (params->spp_flags & SPP_HB_DEMAND && trans) { error = sctp_primitive_REQUESTHEARTBEAT(trans->asoc->base.net, trans->asoc, trans); if (error) return error; } /* Note that unless the spp_flag is set to SPP_HB_ENABLE the value of * this field is ignored. Note also that a value of zero indicates * the current setting should be left unchanged. */ if (params->spp_flags & SPP_HB_ENABLE) { /* Re-zero the interval if the SPP_HB_TIME_IS_ZERO is * set. This lets us use 0 value when this flag * is set. */ if (params->spp_flags & SPP_HB_TIME_IS_ZERO) params->spp_hbinterval = 0; if (params->spp_hbinterval || (params->spp_flags & SPP_HB_TIME_IS_ZERO)) { if (trans) { trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); sctp_transport_reset_hb_timer(trans); } else if (asoc) { asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); } else { sp->hbinterval = params->spp_hbinterval; } } } if (hb_change) { if (trans) { trans->param_flags = (trans->param_flags & ~SPP_HB) | hb_change; } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_HB) | hb_change; } else { sp->param_flags = (sp->param_flags & ~SPP_HB) | hb_change; } } /* When Path MTU discovery is disabled the value specified here will * be the "fixed" path mtu (i.e. the value of the spp_flags field must * include the flag SPP_PMTUD_DISABLE for this field to have any * effect). */ if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { if (trans) { trans->pathmtu = params->spp_pathmtu; sctp_assoc_sync_pmtu(asoc); } else if (asoc) { sctp_assoc_set_pmtu(asoc, params->spp_pathmtu); } else { sp->pathmtu = params->spp_pathmtu; } } if (pmtud_change) { if (trans) { int update = (trans->param_flags & SPP_PMTUD_DISABLE) && (params->spp_flags & SPP_PMTUD_ENABLE); trans->param_flags = (trans->param_flags & ~SPP_PMTUD) | pmtud_change; if (update) { sctp_transport_pmtu(trans, sctp_opt2sk(sp)); sctp_assoc_sync_pmtu(asoc); } sctp_transport_pl_reset(trans); } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; } else { sp->param_flags = (sp->param_flags & ~SPP_PMTUD) | pmtud_change; } } /* Note that unless the spp_flag is set to SPP_SACKDELAY_ENABLE the * value of this field is ignored. Note also that a value of zero * indicates the current setting should be left unchanged. 
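 * (The same zero-means-leave-unchanged convention is applied to
 * spp_pathmaxrxt just below.)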
*/ if ((params->spp_flags & SPP_SACKDELAY_ENABLE) && params->spp_sackdelay) { if (trans) { trans->sackdelay = msecs_to_jiffies(params->spp_sackdelay); } else if (asoc) { asoc->sackdelay = msecs_to_jiffies(params->spp_sackdelay); } else { sp->sackdelay = params->spp_sackdelay; } } if (sackdelay_change) { if (trans) { trans->param_flags = (trans->param_flags & ~SPP_SACKDELAY) | sackdelay_change; } else if (asoc) { asoc->param_flags = (asoc->param_flags & ~SPP_SACKDELAY) | sackdelay_change; } else { sp->param_flags = (sp->param_flags & ~SPP_SACKDELAY) | sackdelay_change; } } /* Note that a value of zero indicates the current setting should be left unchanged. */ if (params->spp_pathmaxrxt) { if (trans) { trans->pathmaxrxt = params->spp_pathmaxrxt; } else if (asoc) { asoc->pathmaxrxt = params->spp_pathmaxrxt; } else { sp->pathmaxrxt = params->spp_pathmaxrxt; } } if (params->spp_flags & SPP_IPV6_FLOWLABEL) { if (trans) { if (trans->ipaddr.sa.sa_family == AF_INET6) { trans->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } } else if (asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->ipaddr.sa.sa_family != AF_INET6) continue; t->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; t->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } asoc->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } else if (sctp_opt2sk(sp)->sk_family == AF_INET6) { sp->flowlabel = params->spp_ipv6_flowlabel & SCTP_FLOWLABEL_VAL_MASK; sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK; } } if (params->spp_flags & SPP_DSCP) { if (trans) { trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; trans->dscp |= SCTP_DSCP_SET_MASK; } else if (asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; t->dscp |= SCTP_DSCP_SET_MASK; } asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; asoc->dscp |= SCTP_DSCP_SET_MASK; } else { sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK; sp->dscp |= SCTP_DSCP_SET_MASK; } } return 0; } static int sctp_setsockopt_peer_addr_params(struct sock *sk, struct sctp_paddrparams *params, unsigned int optlen) { struct sctp_transport *trans = NULL; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); int error; int hb_change, pmtud_change, sackdelay_change; if (optlen == ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4)) { if (params->spp_flags & (SPP_DSCP | SPP_IPV6_FLOWLABEL)) return -EINVAL; } else if (optlen != sizeof(*params)) { return -EINVAL; } /* Validate flags and value parameters. */ hb_change = params->spp_flags & SPP_HB; pmtud_change = params->spp_flags & SPP_PMTUD; sackdelay_change = params->spp_flags & SPP_SACKDELAY; if (hb_change == SPP_HB || pmtud_change == SPP_PMTUD || sackdelay_change == SPP_SACKDELAY || params->spp_sackdelay > 500 || (params->spp_pathmtu && params->spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)¶ms->spp_address)) { trans = sctp_addr_id2transport(sk, ¶ms->spp_address, params->spp_assoc_id); if (!trans) return -EINVAL; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. 
*/ asoc = sctp_id2assoc(sk, params->spp_assoc_id); if (!asoc && params->spp_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Heartbeat demand can only be sent on a transport or * association, but not a socket. */ if (params->spp_flags & SPP_HB_DEMAND && !trans && !asoc) return -EINVAL; /* Process parameters. */ error = sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); if (error) return error; /* If changes are for association, also apply parameters to each * transport. */ if (!trans && asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { sctp_apply_peer_addr_params(params, trans, asoc, sp, hb_change, pmtud_change, sackdelay_change); } } return 0; } static inline __u32 sctp_spp_sackdelay_enable(__u32 param_flags) { return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_ENABLE; } static inline __u32 sctp_spp_sackdelay_disable(__u32 param_flags) { return (param_flags & ~SPP_SACKDELAY) | SPP_SACKDELAY_DISABLE; } static void sctp_apply_asoc_delayed_ack(struct sctp_sack_info *params, struct sctp_association *asoc) { struct sctp_transport *trans; if (params->sack_delay) { asoc->sackdelay = msecs_to_jiffies(params->sack_delay); asoc->param_flags = sctp_spp_sackdelay_enable(asoc->param_flags); } if (params->sack_freq == 1) { asoc->param_flags = sctp_spp_sackdelay_disable(asoc->param_flags); } else if (params->sack_freq > 1) { asoc->sackfreq = params->sack_freq; asoc->param_flags = sctp_spp_sackdelay_enable(asoc->param_flags); } list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { if (params->sack_delay) { trans->sackdelay = msecs_to_jiffies(params->sack_delay); trans->param_flags = sctp_spp_sackdelay_enable(trans->param_flags); } if (params->sack_freq == 1) { trans->param_flags = sctp_spp_sackdelay_disable(trans->param_flags); } else if (params->sack_freq > 1) { trans->sackfreq = params->sack_freq; trans->param_flags = sctp_spp_sackdelay_enable(trans->param_flags); } } } /* * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK) * * This option will effect the way delayed acks are performed. This * option allows you to get or set the delayed ack time, in * milliseconds. It also allows changing the delayed ack frequency. * Changing the frequency to 1 disables the delayed sack algorithm. If * the assoc_id is 0, then this sets or gets the endpoints default * values. If the assoc_id field is non-zero, then the set or get * effects the specified association for the one to many model (the * assoc_id field is ignored by the one to one model). Note that if * sack_delay or sack_freq are 0 when setting this option, then the * current values will remain unchanged. * * struct sctp_sack_info { * sctp_assoc_t sack_assoc_id; * uint32_t sack_delay; * uint32_t sack_freq; * }; * * sack_assoc_id - This parameter, indicates which association the user * is performing an action upon. Note that if this field's value is * zero then the endpoints default value is changed (effecting future * associations only). * * sack_delay - This parameter contains the number of milliseconds that * the user is requesting the delayed ACK timer be set to. Note that * this value is defined in the standard to be between 200 and 500 * milliseconds. * * sack_freq - This parameter contains the number of packets that must * be received before a sack is sent without waiting for the delay * timer to expire. The default value for this is 2, setting this * value to 1 will disable the delayed sack algorithm. 
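 *
 * For illustration only (a userspace sketch that is not part of this
 * file; the SCTP socket descriptor `sd` is an assumption): set a
 * 200 ms delayed-ack timer on the endpoint default, leaving the
 * frequency unchanged:
 *
 *	struct sctp_sack_info sack = {
 *		.sack_assoc_id = SCTP_FUTURE_ASSOC,	// endpoint default
 *		.sack_delay    = 200,			// milliseconds
 *		.sack_freq     = 0,			// 0 == leave unchanged
 *	};
 *	if (setsockopt(sd, IPPROTO_SCTP, SCTP_DELAYED_SACK,
 *		       &sack, sizeof(sack)) < 0)
 *		perror("SCTP_DELAYED_SACK");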
*/ static int __sctp_setsockopt_delayed_ack(struct sock *sk, struct sctp_sack_info *params) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; /* Validate value parameter. */ if (params->sack_delay > 500) return -EINVAL; /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params->sack_assoc_id); if (!asoc && params->sack_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { sctp_apply_asoc_delayed_ack(params, asoc); return 0; } if (sctp_style(sk, TCP)) params->sack_assoc_id = SCTP_FUTURE_ASSOC; if (params->sack_assoc_id == SCTP_FUTURE_ASSOC || params->sack_assoc_id == SCTP_ALL_ASSOC) { if (params->sack_delay) { sp->sackdelay = params->sack_delay; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } if (params->sack_freq == 1) { sp->param_flags = sctp_spp_sackdelay_disable(sp->param_flags); } else if (params->sack_freq > 1) { sp->sackfreq = params->sack_freq; sp->param_flags = sctp_spp_sackdelay_enable(sp->param_flags); } } if (params->sack_assoc_id == SCTP_CURRENT_ASSOC || params->sack_assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) sctp_apply_asoc_delayed_ack(params, asoc); return 0; } static int sctp_setsockopt_delayed_ack(struct sock *sk, struct sctp_sack_info *params, unsigned int optlen) { if (optlen == sizeof(struct sctp_assoc_value)) { struct sctp_assoc_value *v = (struct sctp_assoc_value *)params; struct sctp_sack_info p; pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of struct sctp_assoc_value in delayed_ack socket option.\n" "Use struct sctp_sack_info instead\n", current->comm, task_pid_nr(current)); p.sack_assoc_id = v->assoc_id; p.sack_delay = v->assoc_value; p.sack_freq = v->assoc_value ? 0 : 1; return __sctp_setsockopt_delayed_ack(sk, &p); } if (optlen != sizeof(struct sctp_sack_info)) return -EINVAL; if (params->sack_delay == 0 && params->sack_freq == 0) return 0; return __sctp_setsockopt_delayed_ack(sk, params); } /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association * initialization. The option name argument to setsockopt() and getsockopt() * is SCTP_INITMSG. * * Setting initialization parameters is effective only on an unconnected * socket (for UDP-style sockets only future associations are effected * by the change). With TCP-style sockets, this option is inherited by * sockets derived from a listener socket. */ static int sctp_setsockopt_initmsg(struct sock *sk, struct sctp_initmsg *sinit, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof(struct sctp_initmsg)) return -EINVAL; if (sinit->sinit_num_ostreams) sp->initmsg.sinit_num_ostreams = sinit->sinit_num_ostreams; if (sinit->sinit_max_instreams) sp->initmsg.sinit_max_instreams = sinit->sinit_max_instreams; if (sinit->sinit_max_attempts) sp->initmsg.sinit_max_attempts = sinit->sinit_max_attempts; if (sinit->sinit_max_init_timeo) sp->initmsg.sinit_max_init_timeo = sinit->sinit_max_init_timeo; return 0; } /* * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) * * Applications that wish to use the sendto() system call may wish to * specify a default set of parameters that would normally be supplied * through the inclusion of ancillary data. This socket option allows * such an application to set the default sctp_sndrcvinfo structure. 
* The application that wishes to use this socket option simply passes * in to this call the sctp_sndrcvinfo structure defined in Section * 5.2.2) The input parameters accepted by this call include * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, * sinfo_timetolive. The user must provide the sinfo_assoc_id field in * to this call if the caller is using the UDP model. */ static int sctp_setsockopt_default_send_param(struct sock *sk, struct sctp_sndrcvinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen != sizeof(*info)) return -EINVAL; if (info->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; asoc = sctp_id2assoc(sk, info->sinfo_assoc_id); if (!asoc && info->sinfo_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->default_stream = info->sinfo_stream; asoc->default_flags = info->sinfo_flags; asoc->default_ppid = info->sinfo_ppid; asoc->default_context = info->sinfo_context; asoc->default_timetolive = info->sinfo_timetolive; return 0; } if (sctp_style(sk, TCP)) info->sinfo_assoc_id = SCTP_FUTURE_ASSOC; if (info->sinfo_assoc_id == SCTP_FUTURE_ASSOC || info->sinfo_assoc_id == SCTP_ALL_ASSOC) { sp->default_stream = info->sinfo_stream; sp->default_flags = info->sinfo_flags; sp->default_ppid = info->sinfo_ppid; sp->default_context = info->sinfo_context; sp->default_timetolive = info->sinfo_timetolive; } if (info->sinfo_assoc_id == SCTP_CURRENT_ASSOC || info->sinfo_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { asoc->default_stream = info->sinfo_stream; asoc->default_flags = info->sinfo_flags; asoc->default_ppid = info->sinfo_ppid; asoc->default_context = info->sinfo_context; asoc->default_timetolive = info->sinfo_timetolive; } } return 0; } /* RFC6458, Section 8.1.31. Set/get Default Send Parameters * (SCTP_DEFAULT_SNDINFO) */ static int sctp_setsockopt_default_sndinfo(struct sock *sk, struct sctp_sndinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen != sizeof(*info)) return -EINVAL; if (info->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_ABORT | SCTP_EOF)) return -EINVAL; asoc = sctp_id2assoc(sk, info->snd_assoc_id); if (!asoc && info->snd_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->default_stream = info->snd_sid; asoc->default_flags = info->snd_flags; asoc->default_ppid = info->snd_ppid; asoc->default_context = info->snd_context; return 0; } if (sctp_style(sk, TCP)) info->snd_assoc_id = SCTP_FUTURE_ASSOC; if (info->snd_assoc_id == SCTP_FUTURE_ASSOC || info->snd_assoc_id == SCTP_ALL_ASSOC) { sp->default_stream = info->snd_sid; sp->default_flags = info->snd_flags; sp->default_ppid = info->snd_ppid; sp->default_context = info->snd_context; } if (info->snd_assoc_id == SCTP_CURRENT_ASSOC || info->snd_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { asoc->default_stream = info->snd_sid; asoc->default_flags = info->snd_flags; asoc->default_ppid = info->snd_ppid; asoc->default_context = info->snd_context; } } return 0; } /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as * the association primary. The enclosed address must be one of the * association peer's addresses. 
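 *
 * Illustrative userspace sketch (not part of this file; the socket
 * descriptor `sd`, the association id `aid` and a peer address already
 * filled into `sin` are assumptions):
 *
 *	struct sctp_prim prim;
 *
 *	memset(&prim, 0, sizeof(prim));
 *	prim.ssp_assoc_id = aid;
 *	memcpy(&prim.ssp_addr, &sin, sizeof(sin));	// struct sockaddr_in
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_PRIMARY_ADDR, &prim, sizeof(prim));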
*/ static int sctp_setsockopt_primary_addr(struct sock *sk, struct sctp_prim *prim, unsigned int optlen) { struct sctp_transport *trans; struct sctp_af *af; int err; if (optlen != sizeof(struct sctp_prim)) return -EINVAL; /* Allow security module to validate address but need address len. */ af = sctp_get_af_specific(prim->ssp_addr.ss_family); if (!af) return -EINVAL; err = security_sctp_bind_connect(sk, SCTP_PRIMARY_ADDR, (struct sockaddr *)&prim->ssp_addr, af->sockaddr_len); if (err) return err; trans = sctp_addr_id2transport(sk, &prim->ssp_addr, prim->ssp_assoc_id); if (!trans) return -EINVAL; sctp_assoc_set_primary(trans->asoc, trans); return 0; } /* * 7.1.5 SCTP_NODELAY * * Turn on/off any Nagle-like algorithm. This means that packets are * generally sent as soon as possible and no unnecessary delays are * introduced, at the cost of more packets in the network. Expects an * integer boolean flag. */ static int sctp_setsockopt_nodelay(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->nodelay = (*val == 0) ? 0 : 1; return 0; } /* * * 7.1.1 SCTP_RTOINFO * * The protocol parameters used to initialize and bound retransmission * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access * and modify these parameters. * All parameters are time values, in milliseconds. A value of 0, when * modifying the parameters, indicates that the current value should not * be changed. * */ static int sctp_setsockopt_rtoinfo(struct sock *sk, struct sctp_rtoinfo *rtoinfo, unsigned int optlen) { struct sctp_association *asoc; unsigned long rto_min, rto_max; struct sctp_sock *sp = sctp_sk(sk); if (optlen != sizeof (struct sctp_rtoinfo)) return -EINVAL; asoc = sctp_id2assoc(sk, rtoinfo->srto_assoc_id); /* Set the values to the specific association */ if (!asoc && rtoinfo->srto_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; rto_max = rtoinfo->srto_max; rto_min = rtoinfo->srto_min; if (rto_max) rto_max = asoc ? msecs_to_jiffies(rto_max) : rto_max; else rto_max = asoc ? asoc->rto_max : sp->rtoinfo.srto_max; if (rto_min) rto_min = asoc ? msecs_to_jiffies(rto_min) : rto_min; else rto_min = asoc ? asoc->rto_min : sp->rtoinfo.srto_min; if (rto_min > rto_max) return -EINVAL; if (asoc) { if (rtoinfo->srto_initial != 0) asoc->rto_initial = msecs_to_jiffies(rtoinfo->srto_initial); asoc->rto_max = rto_max; asoc->rto_min = rto_min; } else { /* If there is no association or the association-id = 0 * set the values to the endpoint. */ if (rtoinfo->srto_initial != 0) sp->rtoinfo.srto_initial = rtoinfo->srto_initial; sp->rtoinfo.srto_max = rto_max; sp->rtoinfo.srto_min = rto_min; } return 0; } /* * * 7.1.2 SCTP_ASSOCINFO * * This option is used to tune the maximum retransmission attempts * of the association. * Returns an error if the new association retransmission value is * greater than the sum of the retransmission value of the peer. * See [SCTP] for more information. 
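 *
 * A hedged userspace sketch (not part of this file; `sd` is assumed):
 * raise the endpoint defaults, with zeroed fields left unchanged:
 *
 *	struct sctp_assocparams ap;
 *
 *	memset(&ap, 0, sizeof(ap));		// 0 == keep current value
 *	ap.sasoc_assoc_id = SCTP_FUTURE_ASSOC;
 *	ap.sasoc_asocmaxrxt = 8;
 *	ap.sasoc_cookie_life = 90000;		// milliseconds
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_ASSOCINFO, &ap, sizeof(ap));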
* */ static int sctp_setsockopt_associnfo(struct sock *sk, struct sctp_assocparams *assocparams, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assocparams)) return -EINVAL; asoc = sctp_id2assoc(sk, assocparams->sasoc_assoc_id); if (!asoc && assocparams->sasoc_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Set the values to the specific association */ if (asoc) { if (assocparams->sasoc_asocmaxrxt != 0) { __u32 path_sum = 0; int paths = 0; struct sctp_transport *peer_addr; list_for_each_entry(peer_addr, &asoc->peer.transport_addr_list, transports) { path_sum += peer_addr->pathmaxrxt; paths++; } /* Only validate asocmaxrxt if we have more than * one path/transport. We do this because path * retransmissions are only counted when we have more * then one path. */ if (paths > 1 && assocparams->sasoc_asocmaxrxt > path_sum) return -EINVAL; asoc->max_retrans = assocparams->sasoc_asocmaxrxt; } if (assocparams->sasoc_cookie_life != 0) asoc->cookie_life = ms_to_ktime(assocparams->sasoc_cookie_life); } else { /* Set the values to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); if (assocparams->sasoc_asocmaxrxt != 0) sp->assocparams.sasoc_asocmaxrxt = assocparams->sasoc_asocmaxrxt; if (assocparams->sasoc_cookie_life != 0) sp->assocparams.sasoc_cookie_life = assocparams->sasoc_cookie_life; } return 0; } /* * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) * * This socket option is a boolean flag which turns on or off mapped V4 * addresses. If this option is turned on and the socket is type * PF_INET6, then IPv4 addresses will be mapped to V6 representation. * If this option is turned off, then no mapping will be done of V4 * addresses and a user will receive both PF_INET6 and PF_INET type * addresses on the socket. */ static int sctp_setsockopt_mappedv4(struct sock *sk, int *val, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; if (*val) sp->v4mapped = 1; else sp->v4mapped = 0; return 0; } /* * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG) * This option will get or set the maximum size to put in any outgoing * SCTP DATA chunk. If a message is larger than this size it will be * fragmented by SCTP into the specified size. Note that the underlying * SCTP implementation may fragment into smaller sized chunks when the * PMTU of the underlying association is smaller than the value set by * the user. The default value for this option is '0' which indicates * the user is NOT limiting fragmentation and only the PMTU will effect * SCTP's choice of DATA chunk size. Note also that values set larger * than the maximum size of an IP datagram will effectively let SCTP * control fragmentation (i.e. the same as setting this option to 0). * * The following structure is used to access and modify this parameter: * * struct sctp_assoc_value { * sctp_assoc_t assoc_id; * uint32_t assoc_value; * }; * * assoc_id: This parameter is ignored for one-to-one style sockets. * For one-to-many style sockets this parameter indicates which * association the user is performing an action upon. Note that if * this field's value is zero then the endpoints default value is * changed (effecting future associations only). * assoc_value: This parameter specifies the maximum size in bytes. 
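 *
 * Illustrative userspace sketch (not part of this file; `sd` is an
 * assumption): cap outgoing DATA chunks for future associations:
 *
 *	struct sctp_assoc_value av = {
 *		.assoc_id    = SCTP_FUTURE_ASSOC,
 *		.assoc_value = 1400,		// maximum size in bytes
 *	};
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_MAXSEG, &av, sizeof(av));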
*/ static int sctp_setsockopt_maxseg(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; sctp_assoc_t assoc_id; int val; if (optlen == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in maxseg socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); assoc_id = SCTP_FUTURE_ASSOC; val = *(int *)params; } else if (optlen == sizeof(struct sctp_assoc_value)) { assoc_id = params->assoc_id; val = params->assoc_value; } else { return -EINVAL; } asoc = sctp_id2assoc(sk, assoc_id); if (!asoc && assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (val) { int min_len, max_len; __u16 datasize = asoc ? sctp_datachk_len(&asoc->stream) : sizeof(struct sctp_data_chunk); min_len = sctp_min_frag_point(sp, datasize); max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) return -EINVAL; } if (asoc) { asoc->user_frag = val; sctp_assoc_update_frag_point(asoc); } else { sp->user_frag = val; } return 0; } /* * 7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR) * * Requests that the peer mark the enclosed address as the association * primary. The enclosed address must be one of the association's * locally bound addresses. The following structure is used to make a * set primary request: */ static int sctp_setsockopt_peer_primary_addr(struct sock *sk, struct sctp_setpeerprim *prim, unsigned int optlen) { struct sctp_sock *sp; struct sctp_association *asoc = NULL; struct sctp_chunk *chunk; struct sctp_af *af; int err; sp = sctp_sk(sk); if (!sp->ep->asconf_enable) return -EPERM; if (optlen != sizeof(struct sctp_setpeerprim)) return -EINVAL; asoc = sctp_id2assoc(sk, prim->sspp_assoc_id); if (!asoc) return -EINVAL; if (!asoc->peer.asconf_capable) return -EPERM; if (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY) return -EPERM; if (!sctp_state(asoc, ESTABLISHED)) return -ENOTCONN; af = sctp_get_af_specific(prim->sspp_addr.ss_family); if (!af) return -EINVAL; if (!af->addr_valid((union sctp_addr *)&prim->sspp_addr, sp, NULL)) return -EADDRNOTAVAIL; if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim->sspp_addr)) return -EADDRNOTAVAIL; /* Allow security module to validate address. */ err = security_sctp_bind_connect(sk, SCTP_SET_PEER_PRIMARY_ADDR, (struct sockaddr *)&prim->sspp_addr, af->sockaddr_len); if (err) return err; /* Create an ASCONF chunk with SET_PRIMARY parameter */ chunk = sctp_make_asconf_set_prim(asoc, (union sctp_addr *)&prim->sspp_addr); if (!chunk) return -ENOMEM; err = sctp_send_asconf(asoc, chunk); pr_debug("%s: we set peer primary addr primitively\n", __func__); return err; } static int sctp_setsockopt_adaptation_layer(struct sock *sk, struct sctp_setadaptation *adapt, unsigned int optlen) { if (optlen != sizeof(struct sctp_setadaptation)) return -EINVAL; sctp_sk(sk)->adaptation_ind = adapt->ssb_adaptation_ind; return 0; } /* * 7.1.29. Set or Get the default context (SCTP_CONTEXT) * * The context field in the sctp_sndrcvinfo structure is normally only * used when a failed message is retrieved holding the value that was * sent down on the actual send call. This option allows the setting of * a default context on an association basis that will be received on * reading messages from the peer. This is especially helpful in the * one-2-many model for an application to keep some reference to an * internal state machine that is processing messages on the * association. 
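 *
 * For illustration (a userspace sketch, not part of this file; `sd` is
 * an assumption): install a default receive context for future
 * associations:
 *
 *	struct sctp_assoc_value av = {
 *		.assoc_id    = SCTP_FUTURE_ASSOC,
 *		.assoc_value = 0x1234,		// opaque application tag
 *	};
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_CONTEXT, &av, sizeof(av));
 *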
Note that the setting of this value only effects * received messages from the peer and does not effect the value that is * saved with outbound messages. */ static int sctp_setsockopt_context(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen != sizeof(struct sctp_assoc_value)) return -EINVAL; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->default_rcv_context = params->assoc_value; return 0; } if (sctp_style(sk, TCP)) params->assoc_id = SCTP_FUTURE_ASSOC; if (params->assoc_id == SCTP_FUTURE_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) sp->default_rcv_context = params->assoc_value; if (params->assoc_id == SCTP_CURRENT_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) asoc->default_rcv_context = params->assoc_value; return 0; } /* * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) * * This options will at a minimum specify if the implementation is doing * fragmented interleave. Fragmented interleave, for a one to many * socket, is when subsequent calls to receive a message may return * parts of messages from different associations. Some implementations * may allow you to turn this value on or off. If so, when turned off, * no fragment interleave will occur (which will cause a head of line * blocking amongst multiple associations sharing the same one to many * socket). When this option is turned on, then each receive call may * come from a different association (thus the user must receive data * with the extended calls (e.g. sctp_recvmsg) to keep track of which * association each receive belongs to. * * This option takes a boolean value. A non-zero value indicates that * fragmented interleave is on. A value of zero indicates that * fragmented interleave is off. * * Note that it is important that an implementation that allows this * option to be turned on, have it off by default. Otherwise an unaware * application using the one to many model may become confused and act * incorrectly. */ static int sctp_setsockopt_fragment_interleave(struct sock *sk, int *val, unsigned int optlen) { if (optlen != sizeof(int)) return -EINVAL; sctp_sk(sk)->frag_interleave = !!*val; if (!sctp_sk(sk)->frag_interleave) sctp_sk(sk)->ep->intl_enable = 0; return 0; } /* * 8.1.21. Set or Get the SCTP Partial Delivery Point * (SCTP_PARTIAL_DELIVERY_POINT) * * This option will set or get the SCTP partial delivery point. This * point is the size of a message where the partial delivery API will be * invoked to help free up rwnd space for the peer. Setting this to a * lower value will cause partial deliveries to happen more often. The * calls argument is an integer that sets or gets the partial delivery * point. Note also that the call will fail if the user attempts to set * this value larger than the socket receive buffer size. * * Note that any single message having a length smaller than or equal to * the SCTP partial delivery point will be delivered in one single read * call as long as the user provided buffer is large enough to hold the * message. */ static int sctp_setsockopt_partial_delivery_point(struct sock *sk, u32 *val, unsigned int optlen) { if (optlen != sizeof(u32)) return -EINVAL; /* Note: We double the receive buffer from what the user sets * it to be, also initial rwnd is based on rcvbuf/2. 
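 * For example, after setsockopt(sd, SOL_SOCKET, SO_RCVBUF) with a
 * value of 65536, sk_rcvbuf is 131072 and pd_point values up to 65536
 * pass the check below.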
*/ if (*val > (sk->sk_rcvbuf >> 1)) return -EINVAL; sctp_sk(sk)->pd_point = *val; return 0; /* is this the right error code? */ } /* * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST) * * This option will allow a user to change the maximum burst of packets * that can be emitted by this association. Note that the default value * is 4, and some implementations may restrict this setting so that it * can only be lowered. * * NOTE: This text doesn't seem right. Do this on a socket basis with * future associations inheriting the socket value. */ static int sctp_setsockopt_maxburst(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; sctp_assoc_t assoc_id; u32 assoc_value; if (optlen == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in max_burst socket option deprecated.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); assoc_id = SCTP_FUTURE_ASSOC; assoc_value = *((int *)params); } else if (optlen == sizeof(struct sctp_assoc_value)) { assoc_id = params->assoc_id; assoc_value = params->assoc_value; } else return -EINVAL; asoc = sctp_id2assoc(sk, assoc_id); if (!asoc && assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { asoc->max_burst = assoc_value; return 0; } if (sctp_style(sk, TCP)) assoc_id = SCTP_FUTURE_ASSOC; if (assoc_id == SCTP_FUTURE_ASSOC || assoc_id == SCTP_ALL_ASSOC) sp->max_burst = assoc_value; if (assoc_id == SCTP_CURRENT_ASSOC || assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &sp->ep->asocs, asocs) asoc->max_burst = assoc_value; return 0; } /* * 7.1.18. Add a chunk that must be authenticated (SCTP_AUTH_CHUNK) * * This set option adds a chunk type that the user is requesting to be * received only in an authenticated way. Changes to the list of chunks * will only effect future associations on the socket. */ static int sctp_setsockopt_auth_chunk(struct sock *sk, struct sctp_authchunk *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; if (!ep->auth_enable) return -EACCES; if (optlen != sizeof(struct sctp_authchunk)) return -EINVAL; switch (val->sauth_chunk) { case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: case SCTP_CID_AUTH: return -EINVAL; } /* add this chunk id to the endpoint */ return sctp_auth_ep_add_chunkid(ep, val->sauth_chunk); } /* * 7.1.19. Get or set the list of supported HMAC Identifiers (SCTP_HMAC_IDENT) * * This option gets or sets the list of HMAC algorithms that the local * endpoint requires the peer to use. */ static int sctp_setsockopt_hmac_ident(struct sock *sk, struct sctp_hmacalgo *hmacs, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; u32 idents; if (!ep->auth_enable) return -EACCES; if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + SCTP_AUTH_NUM_HMACS * sizeof(u16)); idents = hmacs->shmac_num_idents; if (idents == 0 || idents > SCTP_AUTH_NUM_HMACS || (idents * sizeof(u16)) > (optlen - sizeof(struct sctp_hmacalgo))) return -EINVAL; return sctp_auth_ep_set_hmacs(ep, hmacs); } /* * 7.1.20. Set a shared key (SCTP_AUTH_KEY) * * This option will set a shared secret key which is used to build an * association shared key. 
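 *
 * A hedged userspace sketch (not part of this file; `sd` and the key
 * bytes in `secret[16]` are assumptions): install a 16-byte key as key
 * number 1 for future associations:
 *
 *	char buf[sizeof(struct sctp_authkey) + 16];
 *	struct sctp_authkey *key = (struct sctp_authkey *)buf;
 *
 *	key->sca_assoc_id  = SCTP_FUTURE_ASSOC;
 *	key->sca_keynumber = 1;
 *	key->sca_keylength = 16;
 *	memcpy(key->sca_key, secret, 16);
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_AUTH_KEY, key, sizeof(buf));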
*/ static int sctp_setsockopt_auth_key(struct sock *sk, struct sctp_authkey *authkey, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = -EINVAL; if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; /* authkey->sca_keylength is u16, so optlen can't be bigger than * this. */ optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(*authkey)); if (authkey->sca_keylength > optlen - sizeof(*authkey)) goto out; asoc = sctp_id2assoc(sk, authkey->sca_assoc_id); if (!asoc && authkey->sca_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) { ret = sctp_auth_set_key(ep, asoc, authkey); goto out; } if (sctp_style(sk, TCP)) authkey->sca_assoc_id = SCTP_FUTURE_ASSOC; if (authkey->sca_assoc_id == SCTP_FUTURE_ASSOC || authkey->sca_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_set_key(ep, asoc, authkey); if (ret) goto out; } ret = 0; if (authkey->sca_assoc_id == SCTP_CURRENT_ASSOC || authkey->sca_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_set_key(ep, asoc, authkey); if (res && !ret) ret = res; } } out: memzero_explicit(authkey, optlen); return ret; } /* * 7.1.21. Get or set the active shared key (SCTP_AUTH_ACTIVE_KEY) * * This option will get or set the active shared key to be used to build * the association shared key. */ static int sctp_setsockopt_active_key(struct sock *sk, struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; asoc = sctp_id2assoc(sk, val->scact_assoc_id); if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) val->scact_assoc_id = SCTP_FUTURE_ASSOC; if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (ret) return ret; } if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_set_active_key(ep, asoc, val->scact_keynumber); if (res && !ret) ret = res; } } return ret; } /* * 7.1.22. Delete a shared key (SCTP_AUTH_DELETE_KEY) * * This set option will delete a shared secret key from use. */ static int sctp_setsockopt_del_key(struct sock *sk, struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; asoc = sctp_id2assoc(sk, val->scact_assoc_id); if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) val->scact_assoc_id = SCTP_FUTURE_ASSOC; if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_del_key_id(ep, asoc, val->scact_keynumber); if (res && !ret) ret = res; } } return ret; } /* * 8.3.4 Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY) * * This set option will deactivate a shared secret key. 
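 *
 * Illustrative userspace sketch (not part of this file; `sd` is an
 * assumption): deactivate key number 1 on every current and future
 * association of a one-to-many socket:
 *
 *	struct sctp_authkeyid kid = {
 *		.scact_assoc_id  = SCTP_ALL_ASSOC,
 *		.scact_keynumber = 1,
 *	};
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
 *		   &kid, sizeof(kid));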
*/ static int sctp_setsockopt_deactivate_key(struct sock *sk, struct sctp_authkeyid *val, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int ret = 0; if (optlen != sizeof(struct sctp_authkeyid)) return -EINVAL; asoc = sctp_id2assoc(sk, val->scact_assoc_id); if (!asoc && val->scact_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (sctp_style(sk, TCP)) val->scact_assoc_id = SCTP_FUTURE_ASSOC; if (val->scact_assoc_id == SCTP_FUTURE_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { ret = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (ret) return ret; } if (val->scact_assoc_id == SCTP_CURRENT_ASSOC || val->scact_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &ep->asocs, asocs) { int res = sctp_auth_deact_key_id(ep, asoc, val->scact_keynumber); if (res && !ret) ret = res; } } return ret; } /* * 8.1.23 SCTP_AUTO_ASCONF * * This option will enable or disable the use of the automatic generation of * ASCONF chunks to add and delete addresses to an existing association. Note * that this option has two caveats namely: a) it only affects sockets that * are bound to all addresses available to the SCTP stack, and b) the system * administrator may have an overriding control that turns the ASCONF feature * off no matter what setting the socket option may have. * This option expects an integer boolean flag, where a non-zero value turns on * the option, and a zero value turns off the option. * Note. In this implementation, socket operation overrides default parameter * being set by sysctl as well as FreeBSD implementation */ static int sctp_setsockopt_auto_asconf(struct sock *sk, int *val, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); if (optlen < sizeof(int)) return -EINVAL; if (!sctp_is_ep_boundall(sk) && *val) return -EINVAL; if ((*val && sp->do_auto_asconf) || (!*val && !sp->do_auto_asconf)) return 0; spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); if (*val == 0 && sp->do_auto_asconf) { list_del(&sp->auto_asconf_list); sp->do_auto_asconf = 0; } else if (*val && !sp->do_auto_asconf) { list_add_tail(&sp->auto_asconf_list, &sock_net(sk)->sctp.auto_asconf_splist); sp->do_auto_asconf = 1; } spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); return 0; } /* * SCTP_PEER_ADDR_THLDS * * This option allows us to alter the partially failed threshold for one or all * transports in an association. See Section 6.1 of: * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_setsockopt_paddr_thresholds(struct sock *sk, struct sctp_paddrthlds_v2 *val, unsigned int optlen, bool v2) { struct sctp_transport *trans; struct sctp_association *asoc; int len; len = v2 ? 
sizeof(*val) : sizeof(struct sctp_paddrthlds); if (optlen < len) return -EINVAL; if (v2 && val->spt_pathpfthld > val->spt_pathcpthld) return -EINVAL; if (!sctp_is_any(sk, (const union sctp_addr *)&val->spt_address)) { trans = sctp_addr_id2transport(sk, &val->spt_address, val->spt_assoc_id); if (!trans) return -ENOENT; if (val->spt_pathmaxrxt) trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) trans->ps_retrans = val->spt_pathcpthld; trans->pf_retrans = val->spt_pathpfthld; return 0; } asoc = sctp_id2assoc(sk, val->spt_assoc_id); if (!asoc && val->spt_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { if (val->spt_pathmaxrxt) trans->pathmaxrxt = val->spt_pathmaxrxt; if (v2) trans->ps_retrans = val->spt_pathcpthld; trans->pf_retrans = val->spt_pathpfthld; } if (val->spt_pathmaxrxt) asoc->pathmaxrxt = val->spt_pathmaxrxt; if (v2) asoc->ps_retrans = val->spt_pathcpthld; asoc->pf_retrans = val->spt_pathpfthld; } else { struct sctp_sock *sp = sctp_sk(sk); if (val->spt_pathmaxrxt) sp->pathmaxrxt = val->spt_pathmaxrxt; if (v2) sp->ps_retrans = val->spt_pathcpthld; sp->pf_retrans = val->spt_pathpfthld; } return 0; } static int sctp_setsockopt_recvrcvinfo(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->recvrcvinfo = (*val == 0) ? 0 : 1; return 0; } static int sctp_setsockopt_recvnxtinfo(struct sock *sk, int *val, unsigned int optlen) { if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->recvnxtinfo = (*val == 0) ? 0 : 1; return 0; } static int sctp_setsockopt_pr_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(*params)) return -EINVAL; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; sctp_sk(sk)->ep->prsctp_enable = !!params->assoc_value; return 0; } static int sctp_setsockopt_default_prinfo(struct sock *sk, struct sctp_default_prinfo *info, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*info)) goto out; if (info->pr_policy & ~SCTP_PR_SCTP_MASK) goto out; if (info->pr_policy == SCTP_PR_SCTP_NONE) info->pr_value = 0; asoc = sctp_id2assoc(sk, info->pr_assoc_id); if (!asoc && info->pr_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy); asoc->default_timetolive = info->pr_value; goto out; } if (sctp_style(sk, TCP)) info->pr_assoc_id = SCTP_FUTURE_ASSOC; if (info->pr_assoc_id == SCTP_FUTURE_ASSOC || info->pr_assoc_id == SCTP_ALL_ASSOC) { SCTP_PR_SET_POLICY(sp->default_flags, info->pr_policy); sp->default_timetolive = info->pr_value; } if (info->pr_assoc_id == SCTP_CURRENT_ASSOC || info->pr_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { SCTP_PR_SET_POLICY(asoc->default_flags, info->pr_policy); asoc->default_timetolive = info->pr_value; } } out: return retval; } static int sctp_setsockopt_reconfig_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; sctp_sk(sk)->ep->reconf_enable = !!params->assoc_value; retval = 0; out: 
return retval; } static int sctp_setsockopt_enable_strreset(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; if (params->assoc_value & (~SCTP_ENABLE_STRRESET_MASK)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) goto out; retval = 0; if (asoc) { asoc->strreset_enable = params->assoc_value; goto out; } if (sctp_style(sk, TCP)) params->assoc_id = SCTP_FUTURE_ASSOC; if (params->assoc_id == SCTP_FUTURE_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) ep->strreset_enable = params->assoc_value; if (params->assoc_id == SCTP_CURRENT_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) list_for_each_entry(asoc, &ep->asocs, asocs) asoc->strreset_enable = params->assoc_value; out: return retval; } static int sctp_setsockopt_reset_streams(struct sock *sk, struct sctp_reset_streams *params, unsigned int optlen) { struct sctp_association *asoc; if (optlen < sizeof(*params)) return -EINVAL; /* srs_number_streams is u16, so optlen can't be bigger than this. */ optlen = min_t(unsigned int, optlen, USHRT_MAX + sizeof(__u16) * sizeof(*params)); if (params->srs_number_streams * sizeof(__u16) > optlen - sizeof(*params)) return -EINVAL; asoc = sctp_id2assoc(sk, params->srs_assoc_id); if (!asoc) return -EINVAL; return sctp_send_reset_streams(asoc, params); } static int sctp_setsockopt_reset_assoc(struct sock *sk, sctp_assoc_t *associd, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(*associd)) return -EINVAL; asoc = sctp_id2assoc(sk, *associd); if (!asoc) return -EINVAL; return sctp_send_reset_assoc(asoc); } static int sctp_setsockopt_add_streams(struct sock *sk, struct sctp_add_streams *params, unsigned int optlen) { struct sctp_association *asoc; if (optlen != sizeof(*params)) return -EINVAL; asoc = sctp_id2assoc(sk, params->sas_assoc_id); if (!asoc) return -EINVAL; return sctp_send_add_streams(asoc, params); } static int sctp_setsockopt_scheduler(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int retval = 0; if (optlen < sizeof(*params)) return -EINVAL; if (params->assoc_value > SCTP_SS_MAX) return -EINVAL; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_sched_set_sched(asoc, params->assoc_value); if (sctp_style(sk, TCP)) params->assoc_id = SCTP_FUTURE_ASSOC; if (params->assoc_id == SCTP_FUTURE_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) sp->default_ss = params->assoc_value; if (params->assoc_id == SCTP_CURRENT_ASSOC || params->assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { int ret = sctp_sched_set_sched(asoc, params->assoc_value); if (ret && !retval) retval = ret; } } return retval; } static int sctp_setsockopt_scheduler_value(struct sock *sk, struct sctp_stream_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen < sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_CURRENT_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) { retval = sctp_sched_set_value(asoc, params->stream_id, params->stream_value, GFP_KERNEL); goto out; } retval = 0; list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs) { int ret = 
sctp_sched_set_value(asoc, params->stream_id, params->stream_value, GFP_KERNEL); if (ret && !retval) /* try to return the 1st error. */ retval = ret; } out: return retval; } static int sctp_setsockopt_interleaving_supported(struct sock *sk, struct sctp_assoc_value *p, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; if (optlen < sizeof(*p)) return -EINVAL; asoc = sctp_id2assoc(sk, p->assoc_id); if (!asoc && p->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (!sock_net(sk)->sctp.intl_enable || !sp->frag_interleave) { return -EPERM; } sp->ep->intl_enable = !!p->assoc_value; return 0; } static int sctp_setsockopt_reuse_port(struct sock *sk, int *val, unsigned int optlen) { if (!sctp_style(sk, TCP)) return -EOPNOTSUPP; if (sctp_sk(sk)->ep->base.bind_addr.port) return -EFAULT; if (optlen < sizeof(int)) return -EINVAL; sctp_sk(sk)->reuse = !!*val; return 0; } static int sctp_assoc_ulpevent_type_set(struct sctp_event *param, struct sctp_association *asoc) { struct sctp_ulpevent *event; sctp_ulpevent_type_set(&asoc->subscribe, param->se_type, param->se_on); if (param->se_type == SCTP_SENDER_DRY_EVENT && param->se_on) { if (sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; asoc->stream.si->enqueue_event(&asoc->ulpq, event); } } return 0; } static int sctp_setsockopt_event(struct sock *sk, struct sctp_event *param, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; int retval = 0; if (optlen < sizeof(*param)) return -EINVAL; if (param->se_type < SCTP_SN_TYPE_BASE || param->se_type > SCTP_SN_TYPE_MAX) return -EINVAL; asoc = sctp_id2assoc(sk, param->se_assoc_id); if (!asoc && param->se_assoc_id > SCTP_ALL_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) return sctp_assoc_ulpevent_type_set(param, asoc); if (sctp_style(sk, TCP)) param->se_assoc_id = SCTP_FUTURE_ASSOC; if (param->se_assoc_id == SCTP_FUTURE_ASSOC || param->se_assoc_id == SCTP_ALL_ASSOC) sctp_ulpevent_type_set(&sp->subscribe, param->se_type, param->se_on); if (param->se_assoc_id == SCTP_CURRENT_ASSOC || param->se_assoc_id == SCTP_ALL_ASSOC) { list_for_each_entry(asoc, &sp->ep->asocs, asocs) { int ret = sctp_assoc_ulpevent_type_set(param, asoc); if (ret && !retval) retval = ret; } } return retval; } static int sctp_setsockopt_asconf_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; ep->asconf_enable = !!params->assoc_value; if (ep->asconf_enable && ep->auth_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); } retval = 0; out: return retval; } static int sctp_setsockopt_auth_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; struct sctp_endpoint *ep; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; ep = sctp_sk(sk)->ep; if (params->assoc_value) { retval = sctp_auth_init(ep, GFP_KERNEL); if (retval) goto out; if (ep->asconf_enable) { sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF); 
sctp_auth_ep_add_chunkid(ep, SCTP_CID_ASCONF_ACK); } } ep->auth_enable = !!params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_ecn_supported(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; sctp_sk(sk)->ep->ecn_enable = !!params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_pf_expose(struct sock *sk, struct sctp_assoc_value *params, unsigned int optlen) { struct sctp_association *asoc; int retval = -EINVAL; if (optlen != sizeof(*params)) goto out; if (params->assoc_value > SCTP_PF_EXPOSE_MAX) goto out; asoc = sctp_id2assoc(sk, params->assoc_id); if (!asoc && params->assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) goto out; if (asoc) asoc->pf_expose = params->assoc_value; else sctp_sk(sk)->pf_expose = params->assoc_value; retval = 0; out: return retval; } static int sctp_setsockopt_encap_port(struct sock *sk, struct sctp_udpencaps *encap, unsigned int optlen) { struct sctp_association *asoc; struct sctp_transport *t; __be16 encap_port; if (optlen != sizeof(*encap)) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ encap_port = (__force __be16)encap->sue_port; if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) { t = sctp_addr_id2transport(sk, &encap->sue_address, encap->sue_assoc_id); if (!t) return -EINVAL; t->encap_port = encap_port; return 0; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, encap->sue_assoc_id); if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* If changes are for association, also apply encap_port to * each transport. */ if (asoc) { list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) t->encap_port = encap_port; asoc->encap_port = encap_port; return 0; } sctp_sk(sk)->encap_port = encap_port; return 0; } static int sctp_setsockopt_probe_interval(struct sock *sk, struct sctp_probeinterval *params, unsigned int optlen) { struct sctp_association *asoc; struct sctp_transport *t; __u32 probe_interval; if (optlen != sizeof(*params)) return -EINVAL; probe_interval = params->spi_interval; if (probe_interval && probe_interval < SCTP_PROBE_TIMER_MIN) return -EINVAL; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params->spi_address)) { t = sctp_addr_id2transport(sk, &params->spi_address, params->spi_assoc_id); if (!t) return -EINVAL; t->probe_interval = msecs_to_jiffies(probe_interval); sctp_transport_pl_reset(t); return 0; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params->spi_assoc_id); if (!asoc && params->spi_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* If changes are for association, also apply probe_interval to * each transport.
*/ if (asoc) { list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->probe_interval = msecs_to_jiffies(probe_interval); sctp_transport_pl_reset(t); } asoc->probe_interval = msecs_to_jiffies(probe_interval); return 0; } sctp_sk(sk)->probe_interval = probe_interval; return 0; } /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve * socket options. Socket options are used to change the default * behavior of sockets calls. They are described in Section 7. * * The syntax is: * * ret = getsockopt(int sd, int level, int optname, void __user *optval, * int __user *optlen); * ret = setsockopt(int sd, int level, int optname, const void __user *optval, * int optlen); * * sd - the socket descript. * level - set to IPPROTO_SCTP for all SCTP options. * optname - the option name. * optval - the buffer to store the value of the option. * optlen - the size of the buffer. */ static int sctp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { void *kopt = NULL; int retval = 0; pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname); /* I can hardly begin to describe how wrong this is. This is * so broken as to be worse than useless. The API draft * REALLY is NOT helpful here... I am not convinced that the * semantics of setsockopt() with a level OTHER THAN SOL_SCTP * are at all well-founded. */ if (level != SOL_SCTP) { struct sctp_af *af = sctp_sk(sk)->pf->af; return af->setsockopt(sk, level, optname, optval, optlen); } if (optlen > 0) { /* Trim it to the biggest size sctp sockopt may need if necessary */ optlen = min_t(unsigned int, optlen, PAGE_ALIGN(USHRT_MAX + sizeof(__u16) * sizeof(struct sctp_reset_streams))); kopt = memdup_sockptr(optval, optlen); if (IS_ERR(kopt)) return PTR_ERR(kopt); } lock_sock(sk); switch (optname) { case SCTP_SOCKOPT_BINDX_ADD: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_bindx(sk, kopt, optlen, SCTP_BINDX_ADD_ADDR); break; case SCTP_SOCKOPT_BINDX_REM: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_bindx(sk, kopt, optlen, SCTP_BINDX_REM_ADDR); break; case SCTP_SOCKOPT_CONNECTX_OLD: /* 'optlen' is the size of the addresses buffer. */ retval = sctp_setsockopt_connectx_old(sk, kopt, optlen); break; case SCTP_SOCKOPT_CONNECTX: /* 'optlen' is the size of the addresses buffer. 
*/ retval = sctp_setsockopt_connectx(sk, kopt, optlen); break; case SCTP_DISABLE_FRAGMENTS: retval = sctp_setsockopt_disable_fragments(sk, kopt, optlen); break; case SCTP_EVENTS: retval = sctp_setsockopt_events(sk, kopt, optlen); break; case SCTP_AUTOCLOSE: retval = sctp_setsockopt_autoclose(sk, kopt, optlen); break; case SCTP_PEER_ADDR_PARAMS: retval = sctp_setsockopt_peer_addr_params(sk, kopt, optlen); break; case SCTP_DELAYED_SACK: retval = sctp_setsockopt_delayed_ack(sk, kopt, optlen); break; case SCTP_PARTIAL_DELIVERY_POINT: retval = sctp_setsockopt_partial_delivery_point(sk, kopt, optlen); break; case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, kopt, optlen); break; case SCTP_DEFAULT_SEND_PARAM: retval = sctp_setsockopt_default_send_param(sk, kopt, optlen); break; case SCTP_DEFAULT_SNDINFO: retval = sctp_setsockopt_default_sndinfo(sk, kopt, optlen); break; case SCTP_PRIMARY_ADDR: retval = sctp_setsockopt_primary_addr(sk, kopt, optlen); break; case SCTP_SET_PEER_PRIMARY_ADDR: retval = sctp_setsockopt_peer_primary_addr(sk, kopt, optlen); break; case SCTP_NODELAY: retval = sctp_setsockopt_nodelay(sk, kopt, optlen); break; case SCTP_RTOINFO: retval = sctp_setsockopt_rtoinfo(sk, kopt, optlen); break; case SCTP_ASSOCINFO: retval = sctp_setsockopt_associnfo(sk, kopt, optlen); break; case SCTP_I_WANT_MAPPED_V4_ADDR: retval = sctp_setsockopt_mappedv4(sk, kopt, optlen); break; case SCTP_MAXSEG: retval = sctp_setsockopt_maxseg(sk, kopt, optlen); break; case SCTP_ADAPTATION_LAYER: retval = sctp_setsockopt_adaptation_layer(sk, kopt, optlen); break; case SCTP_CONTEXT: retval = sctp_setsockopt_context(sk, kopt, optlen); break; case SCTP_FRAGMENT_INTERLEAVE: retval = sctp_setsockopt_fragment_interleave(sk, kopt, optlen); break; case SCTP_MAX_BURST: retval = sctp_setsockopt_maxburst(sk, kopt, optlen); break; case SCTP_AUTH_CHUNK: retval = sctp_setsockopt_auth_chunk(sk, kopt, optlen); break; case SCTP_HMAC_IDENT: retval = sctp_setsockopt_hmac_ident(sk, kopt, optlen); break; case SCTP_AUTH_KEY: retval = sctp_setsockopt_auth_key(sk, kopt, optlen); break; case SCTP_AUTH_ACTIVE_KEY: retval = sctp_setsockopt_active_key(sk, kopt, optlen); break; case SCTP_AUTH_DELETE_KEY: retval = sctp_setsockopt_del_key(sk, kopt, optlen); break; case SCTP_AUTH_DEACTIVATE_KEY: retval = sctp_setsockopt_deactivate_key(sk, kopt, optlen); break; case SCTP_AUTO_ASCONF: retval = sctp_setsockopt_auto_asconf(sk, kopt, optlen); break; case SCTP_PEER_ADDR_THLDS: retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, false); break; case SCTP_PEER_ADDR_THLDS_V2: retval = sctp_setsockopt_paddr_thresholds(sk, kopt, optlen, true); break; case SCTP_RECVRCVINFO: retval = sctp_setsockopt_recvrcvinfo(sk, kopt, optlen); break; case SCTP_RECVNXTINFO: retval = sctp_setsockopt_recvnxtinfo(sk, kopt, optlen); break; case SCTP_PR_SUPPORTED: retval = sctp_setsockopt_pr_supported(sk, kopt, optlen); break; case SCTP_DEFAULT_PRINFO: retval = sctp_setsockopt_default_prinfo(sk, kopt, optlen); break; case SCTP_RECONFIG_SUPPORTED: retval = sctp_setsockopt_reconfig_supported(sk, kopt, optlen); break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_setsockopt_enable_strreset(sk, kopt, optlen); break; case SCTP_RESET_STREAMS: retval = sctp_setsockopt_reset_streams(sk, kopt, optlen); break; case SCTP_RESET_ASSOC: retval = sctp_setsockopt_reset_assoc(sk, kopt, optlen); break; case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, kopt, optlen); break; case SCTP_STREAM_SCHEDULER: retval = sctp_setsockopt_scheduler(sk, kopt, optlen); break; case 
SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_setsockopt_scheduler_value(sk, kopt, optlen); break; case SCTP_INTERLEAVING_SUPPORTED: retval = sctp_setsockopt_interleaving_supported(sk, kopt, optlen); break; case SCTP_REUSE_PORT: retval = sctp_setsockopt_reuse_port(sk, kopt, optlen); break; case SCTP_EVENT: retval = sctp_setsockopt_event(sk, kopt, optlen); break; case SCTP_ASCONF_SUPPORTED: retval = sctp_setsockopt_asconf_supported(sk, kopt, optlen); break; case SCTP_AUTH_SUPPORTED: retval = sctp_setsockopt_auth_supported(sk, kopt, optlen); break; case SCTP_ECN_SUPPORTED: retval = sctp_setsockopt_ecn_supported(sk, kopt, optlen); break; case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_setsockopt_pf_expose(sk, kopt, optlen); break; case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_setsockopt_encap_port(sk, kopt, optlen); break; case SCTP_PLPMTUD_PROBE_INTERVAL: retval = sctp_setsockopt_probe_interval(sk, kopt, optlen); break; default: retval = -ENOPROTOOPT; break; } release_sock(sk); kfree(kopt); return retval; } /* API 3.1.6 connect() - UDP Style Syntax * * An application may use the connect() call in the UDP model to initiate an * association without sending data. * * The syntax is: * * ret = connect(int sd, const struct sockaddr *nam, socklen_t len); * * sd: the socket descriptor to have a new association added to. * * nam: the address structure (either struct sockaddr_in or struct * sockaddr_in6 defined in RFC2553 [7]). * * len: the size of the address. */ static int sctp_connect(struct sock *sk, struct sockaddr *addr, int addr_len, int flags) { struct sctp_af *af; int err = -EINVAL; lock_sock(sk); pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); /* Validate addr_len before calling common connect/connectx routine. */ af = sctp_get_af_specific(addr->sa_family); if (af && addr_len >= af->sockaddr_len) err = __sctp_connect(sk, addr, af->sockaddr_len, flags, NULL); release_sock(sk); return err; } int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return -EOPNOTSUPP; return sctp_connect(sock->sk, uaddr, addr_len, flags); } /* FIXME: Write comments. */ static int sctp_disconnect(struct sock *sk, int flags) { return -EOPNOTSUPP; /* STUB */ } /* 4.1.4 accept() - TCP Style Syntax * * Applications use accept() call to remove an established SCTP * association from the accept queue of the endpoint. A new socket * descriptor will be returned from accept() to represent the newly * formed association. */ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sock *newsk = NULL; struct sctp_association *asoc; long timeo; int error = 0; lock_sock(sk); sp = sctp_sk(sk); ep = sp->ep; if (!sctp_style(sk, TCP)) { error = -EOPNOTSUPP; goto out; } if (!sctp_sstate(sk, LISTENING)) { error = -EINVAL; goto out; } timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); error = sctp_wait_for_accept(sk, timeo); if (error) goto out; /* We treat the list of associations on the endpoint as the accept * queue and pick the first association on the list. */ asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); newsk = sp->pf->create_accept_sk(sk, asoc, kern); if (!newsk) { error = -ENOMEM; goto out; } /* Populate the fields of the newsk from the oldsk and migrate the * asoc to the newsk. 
*/ error = sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); if (error) { sk_common_release(newsk); newsk = NULL; } out: release_sock(sk); *err = error; return newsk; } /* The SCTP ioctl handler. */ static int sctp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; lock_sock(sk); /* * SEQPACKET-style sockets in LISTENING state are valid, for * SCTP, so only discard TCP-style sockets in LISTENING state. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) goto out; switch (cmd) { case SIOCINQ: { struct sk_buff *skb; *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. */ *karg = skb->len; } rc = 0; break; } default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } /* This is the function which gets called during socket creation to * initialized the SCTP-specific portion of the sock. * The sock structure should already be zero-filled memory. */ static int sctp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); struct sctp_sock *sp; pr_debug("%s: sk:%p\n", __func__, sk); sp = sctp_sk(sk); /* Initialize the SCTP per socket area. */ switch (sk->sk_type) { case SOCK_SEQPACKET: sp->type = SCTP_SOCKET_UDP; break; case SOCK_STREAM: sp->type = SCTP_SOCKET_TCP; break; default: return -ESOCKTNOSUPPORT; } sk->sk_gso_type = SKB_GSO_SCTP; /* Initialize default send parameters. These parameters can be * modified with the SCTP_DEFAULT_SEND_PARAM socket option. */ sp->default_stream = 0; sp->default_ppid = 0; sp->default_flags = 0; sp->default_context = 0; sp->default_timetolive = 0; sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or * overridden by the SCTP_INIT CMSG. */ sp->initmsg.sinit_num_ostreams = sctp_max_outstreams; sp->initmsg.sinit_max_instreams = sctp_max_instreams; sp->initmsg.sinit_max_attempts = net->sctp.max_retrans_init; sp->initmsg.sinit_max_init_timeo = net->sctp.rto_max; /* Initialize default RTO related parameters. These parameters can * be modified for with the SCTP_RTOINFO socket option. */ sp->rtoinfo.srto_initial = net->sctp.rto_initial; sp->rtoinfo.srto_max = net->sctp.rto_max; sp->rtoinfo.srto_min = net->sctp.rto_min; /* Initialize default association related parameters. These parameters * can be modified with the SCTP_ASSOCINFO socket option. */ sp->assocparams.sasoc_asocmaxrxt = net->sctp.max_retrans_association; sp->assocparams.sasoc_number_peer_destinations = 0; sp->assocparams.sasoc_peer_rwnd = 0; sp->assocparams.sasoc_local_rwnd = 0; sp->assocparams.sasoc_cookie_life = net->sctp.valid_cookie_life; /* Initialize default event subscriptions. By default, all the * options are off. */ sp->subscribe = 0; /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ sp->hbinterval = net->sctp.hb_interval; sp->udp_port = htons(net->sctp.udp_port); sp->encap_port = htons(net->sctp.encap_port); sp->pathmaxrxt = net->sctp.max_retrans_path; sp->pf_retrans = net->sctp.pf_retrans; sp->ps_retrans = net->sctp.ps_retrans; sp->pf_expose = net->sctp.pf_expose; sp->pathmtu = 0; /* allow default discovery */ sp->sackdelay = net->sctp.sack_timeout; sp->sackfreq = 2; sp->param_flags = SPP_HB_ENABLE | SPP_PMTUD_ENABLE | SPP_SACKDELAY_ENABLE; sp->default_ss = SCTP_SS_DEFAULT; /* If enabled no SCTP message fragmentation will be performed. 
* Configure through SCTP_DISABLE_FRAGMENTS socket option. */ sp->disable_fragments = 0; /* Enable Nagle algorithm by default. */ sp->nodelay = 0; sp->recvrcvinfo = 0; sp->recvnxtinfo = 0; /* Enable by default. */ sp->v4mapped = 1; /* Auto-close idle associations after the configured * number of seconds. A value of 0 disables this * feature. Configure through the SCTP_AUTOCLOSE socket option, * for UDP-style sockets only. */ sp->autoclose = 0; /* User specified fragmentation limit. */ sp->user_frag = 0; sp->adaptation_ind = 0; sp->pf = sctp_get_pf_specific(sk->sk_family); /* Control variables for partial data delivery. */ atomic_set(&sp->pd_mode, 0); skb_queue_head_init(&sp->pd_lobby); sp->frag_interleave = 0; sp->probe_interval = net->sctp.probe_interval; /* Create a per socket endpoint structure. Even if we * change the data structure relationships, this may still * be useful for storing pre-connect address information. */ sp->ep = sctp_endpoint_new(sk, GFP_KERNEL); if (!sp->ep) return -ENOMEM; sp->hmac = NULL; sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); return 0; } /* Cleanup any SCTP per socket resources. Must be called with * sock_net(sk)->sctp.addr_wq_lock held if sp->do_auto_asconf is true */ static void sctp_destroy_sock(struct sock *sk) { struct sctp_sock *sp; pr_debug("%s: sk:%p\n", __func__, sk); /* Release our hold on the endpoint. */ sp = sctp_sk(sk); /* This could happen during socket init, thus we bail out * early, since the rest of the below is not setup either. */ if (sp->ep == NULL) return; if (sp->do_auto_asconf) { sp->do_auto_asconf = 0; list_del(&sp->auto_asconf_list); } sctp_endpoint_free(sp->ep); sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } /* Triggered when there are no references on the socket anymore */ static void sctp_destruct_common(struct sock *sk) { struct sctp_sock *sp = sctp_sk(sk); /* Free up the HMAC transform. */ crypto_free_shash(sp->hmac); } static void sctp_destruct_sock(struct sock *sk) { sctp_destruct_common(sk); inet_sock_destruct(sk); } /* API 4.1.7 shutdown() - TCP Style Syntax * int shutdown(int socket, int how); * * sd - the socket descriptor of the association to be closed. * how - Specifies the type of shutdown. The values are * as follows: * SHUT_RD * Disables further receive operations. No SCTP * protocol action is taken. * SHUT_WR * Disables further send operations, and initiates * the SCTP shutdown sequence. * SHUT_RDWR * Disables further send and receive operations * and initiates the SCTP shutdown sequence. 
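 *
 * An illustrative user-space sketch; sd is assumed to be a
 * one-to-one style SCTP socket with an established association:
 *
 *	shutdown(sd, SHUT_WR);
 *
 * which disables further sends and initiates the SCTP shutdown
 * sequence, as described for SHUT_WR above.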
*/ static void sctp_shutdown(struct sock *sk, int how) { struct net *net = sock_net(sk); struct sctp_endpoint *ep; if (!sctp_style(sk, TCP)) return; ep = sctp_sk(sk)->ep; if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) { struct sctp_association *asoc; inet_sk_set_state(sk, SCTP_SS_CLOSING); asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); sctp_primitive_SHUTDOWN(net, asoc, NULL); } } int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info) { struct sctp_transport *prim; struct list_head *pos; int mask; memset(info, 0, sizeof(*info)); if (!asoc) { struct sctp_sock *sp = sctp_sk(sk); info->sctpi_s_autoclose = sp->autoclose; info->sctpi_s_adaptation_ind = sp->adaptation_ind; info->sctpi_s_pd_point = sp->pd_point; info->sctpi_s_nodelay = sp->nodelay; info->sctpi_s_disable_fragments = sp->disable_fragments; info->sctpi_s_v4mapped = sp->v4mapped; info->sctpi_s_frag_interleave = sp->frag_interleave; info->sctpi_s_type = sp->type; return 0; } info->sctpi_tag = asoc->c.my_vtag; info->sctpi_state = asoc->state; info->sctpi_rwnd = asoc->a_rwnd; info->sctpi_unackdata = asoc->unack_data; info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); info->sctpi_instrms = asoc->stream.incnt; info->sctpi_outstrms = asoc->stream.outcnt; list_for_each(pos, &asoc->base.inqueue.in_chunk_list) info->sctpi_inqueue++; list_for_each(pos, &asoc->outqueue.out_chunk_list) info->sctpi_outqueue++; info->sctpi_overall_error = asoc->overall_error_count; info->sctpi_max_burst = asoc->max_burst; info->sctpi_maxseg = asoc->frag_point; info->sctpi_peer_rwnd = asoc->peer.rwnd; info->sctpi_peer_tag = asoc->c.peer_vtag; mask = asoc->peer.intl_capable << 1; mask = (mask | asoc->peer.ecn_capable) << 1; mask = (mask | asoc->peer.ipv4_address) << 1; mask = (mask | asoc->peer.ipv6_address) << 1; mask = (mask | asoc->peer.reconf_capable) << 1; mask = (mask | asoc->peer.asconf_capable) << 1; mask = (mask | asoc->peer.prsctp_capable) << 1; mask = (mask | asoc->peer.auth_capable); info->sctpi_peer_capable = mask; mask = asoc->peer.sack_needed << 1; mask = (mask | asoc->peer.sack_generation) << 1; mask = (mask | asoc->peer.zero_window_announced); info->sctpi_peer_sack = mask; info->sctpi_isacks = asoc->stats.isacks; info->sctpi_osacks = asoc->stats.osacks; info->sctpi_opackets = asoc->stats.opackets; info->sctpi_ipackets = asoc->stats.ipackets; info->sctpi_rtxchunks = asoc->stats.rtxchunks; info->sctpi_outofseqtsns = asoc->stats.outofseqtsns; info->sctpi_idupchunks = asoc->stats.idupchunks; info->sctpi_gapcnt = asoc->stats.gapcnt; info->sctpi_ouodchunks = asoc->stats.ouodchunks; info->sctpi_iuodchunks = asoc->stats.iuodchunks; info->sctpi_oodchunks = asoc->stats.oodchunks; info->sctpi_iodchunks = asoc->stats.iodchunks; info->sctpi_octrlchunks = asoc->stats.octrlchunks; info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; prim = asoc->peer.primary_path; memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); info->sctpi_p_state = prim->state; info->sctpi_p_cwnd = prim->cwnd; info->sctpi_p_srtt = prim->srtt; info->sctpi_p_rto = jiffies_to_msecs(prim->rto); info->sctpi_p_hbinterval = prim->hbinterval; info->sctpi_p_pathmaxrxt = prim->pathmaxrxt; info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay); info->sctpi_p_ssthresh = prim->ssthresh; info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked; info->sctpi_p_flight_size = prim->flight_size; info->sctpi_p_error = prim->error_count; return 0; } EXPORT_SYMBOL_GPL(sctp_get_sctp_info); /* use callback to 
avoid exporting the core structure */ void sctp_transport_walk_start(struct rhashtable_iter *iter) __acquires(RCU) { rhltable_walk_enter(&sctp_transport_hashtable, iter); rhashtable_walk_start(iter); } void sctp_transport_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { rhashtable_walk_stop(iter); rhashtable_walk_exit(iter); } struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter) { struct sctp_transport *t; t = rhashtable_walk_next(iter); for (; t; t = rhashtable_walk_next(iter)) { if (IS_ERR(t)) { if (PTR_ERR(t) == -EAGAIN) continue; break; } if (!sctp_transport_hold(t)) continue; if (net_eq(t->asoc->base.net, net) && t->asoc->peer.primary_path == t) break; sctp_transport_put(t); } return t; } struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos) { struct sctp_transport *t; if (!pos) return SEQ_START_TOKEN; while ((t = sctp_transport_get_next(net, iter)) && !IS_ERR(t)) { if (!--pos) break; sctp_transport_put(t); } return t; } int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p) { int err = 0; int hash = 0; struct sctp_endpoint *ep; struct sctp_hashbucket *head; for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize; hash++, head++) { read_lock_bh(&head->lock); sctp_for_each_hentry(ep, &head->chain) { err = cb(ep, p); if (err) break; } read_unlock_bh(&head->lock); } return err; } EXPORT_SYMBOL_GPL(sctp_for_each_endpoint); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, void *p, int dif) { struct sctp_transport *transport; struct sctp_endpoint *ep; int err = -ENOENT; rcu_read_lock(); transport = sctp_addrs_lookup_transport(net, laddr, paddr, dif, dif); if (!transport) { rcu_read_unlock(); return err; } ep = transport->asoc->ep; if (!sctp_endpoint_hold(ep)) { /* asoc can be peeled off */ sctp_transport_put(transport); rcu_read_unlock(); return err; } rcu_read_unlock(); err = cb(ep, transport, p); sctp_endpoint_put(ep); sctp_transport_put(transport); return err; } EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p) { struct rhashtable_iter hti; struct sctp_transport *tsp; struct sctp_endpoint *ep; int ret; again: ret = 0; sctp_transport_walk_start(&hti); tsp = sctp_transport_get_idx(net, &hti, *pos + 1); for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { ep = tsp->asoc->ep; if (sctp_endpoint_hold(ep)) { /* asoc can be peeled off */ ret = cb(ep, tsp, p); if (ret) break; sctp_endpoint_put(ep); } (*pos)++; sctp_transport_put(tsp); } sctp_transport_walk_stop(&hti); if (ret) { if (cb_done && !cb_done(ep, tsp, p)) { (*pos)++; sctp_endpoint_put(ep); sctp_transport_put(tsp); goto again; } sctp_endpoint_put(ep); sctp_transport_put(tsp); } return ret; } EXPORT_SYMBOL_GPL(sctp_transport_traverse_process); /* 7.2.1 Association Status (SCTP_STATUS) * Applications can retrieve current status information about an * association, including association state, peer receiver window size, * number of unacked data chunks, and number of data chunks pending * receipt. This information is read-only. 
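 *
 * An illustrative user-space sketch; sd and assoc_id are assumed
 * to name an existing SCTP socket and association:
 *
 *	struct sctp_status st = { .sstat_assoc_id = assoc_id };
 *	socklen_t len = sizeof(st);
 *
 *	if (!getsockopt(sd, IPPROTO_SCTP, SCTP_STATUS, &st, &len))
 *		printf("state %d rwnd %u\n", st.sstat_state,
 *		       st.sstat_rwnd);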
*/ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_status status; struct sctp_association *asoc = NULL; struct sctp_transport *transport; sctp_assoc_t associd; int retval = 0; if (len < sizeof(status)) { retval = -EINVAL; goto out; } len = sizeof(status); if (copy_from_user(&status, optval, len)) { retval = -EFAULT; goto out; } associd = status.sstat_assoc_id; asoc = sctp_id2assoc(sk, associd); if (!asoc) { retval = -EINVAL; goto out; } transport = asoc->peer.primary_path; status.sstat_assoc_id = sctp_assoc2id(asoc); status.sstat_state = sctp_assoc_to_state(asoc); status.sstat_rwnd = asoc->peer.rwnd; status.sstat_unackdata = asoc->unack_data; status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); status.sstat_instrms = asoc->stream.incnt; status.sstat_outstrms = asoc->stream.outcnt; status.sstat_fragmentation_point = asoc->frag_point; status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); memcpy(&status.sstat_primary.spinfo_address, &transport->ipaddr, transport->af_specific->sockaddr_len); /* Map ipv4 address into v4-mapped-on-v6 address. */ sctp_get_pf_specific(sk->sk_family)->addr_to_user(sctp_sk(sk), (union sctp_addr *)&status.sstat_primary.spinfo_address); status.sstat_primary.spinfo_state = transport->state; status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; if (put_user(len, optlen)) { retval = -EFAULT; goto out; } pr_debug("%s: len:%d, state:%d, rwnd:%d, assoc_id:%d\n", __func__, len, status.sstat_state, status.sstat_rwnd, status.sstat_assoc_id); if (copy_to_user(optval, &status, len)) { retval = -EFAULT; goto out; } out: return retval; } /* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO) * * Applications can retrieve information about a specific peer address * of an association, including its reachability state, congestion * window, and retransmission timer values. This information is * read-only. */ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_paddrinfo pinfo; struct sctp_transport *transport; int retval = 0; if (len < sizeof(pinfo)) { retval = -EINVAL; goto out; } len = sizeof(pinfo); if (copy_from_user(&pinfo, optval, len)) { retval = -EFAULT; goto out; } transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, pinfo.spinfo_assoc_id); if (!transport) { retval = -EINVAL; goto out; } if (transport->state == SCTP_PF && transport->asoc->pf_expose == SCTP_PF_EXPOSE_DISABLE) { retval = -EACCES; goto out; } pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); pinfo.spinfo_state = transport->state; pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; if (put_user(len, optlen)) { retval = -EFAULT; goto out; } if (copy_to_user(optval, &pinfo, len)) { retval = -EFAULT; goto out; } out: return retval; } /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) * * This option is an on/off flag. If enabled, no SCTP message * fragmentation will be performed.
Instead, if a message being sent * exceeds the current PMTU size, the message will NOT be sent and * instead an error will be indicated to the user. */ static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = (sctp_sk(sk)->disable_fragments == 1); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* 7.1.15 Set notification and ancillary events (SCTP_EVENTS) * * This socket option is used to specify various notifications and * ancillary data the user wishes to receive. */ static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_event_subscribe subscribe; __u8 *sn_type = (__u8 *)&subscribe; int i; if (len == 0) return -EINVAL; if (len > sizeof(struct sctp_event_subscribe)) len = sizeof(struct sctp_event_subscribe); if (put_user(len, optlen)) return -EFAULT; for (i = 0; i < len; i++) sn_type[i] = sctp_ulpevent_type_enabled(sctp_sk(sk)->subscribe, SCTP_SN_TYPE_BASE + i); if (copy_to_user(optval, &subscribe, len)) return -EFAULT; return 0; } /* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) * * This socket option is applicable to the UDP-style socket only. When * set it will cause associations that are idle for more than the * specified number of seconds to automatically close. An association * being idle is defined as an association that has NOT sent or received * user data. The special value of '0' indicates that no automatic * close of any associations should be performed. The option expects an * integer defining the number of seconds of idle time before an * association is closed. */ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen) { /* Applicable to UDP-style socket only */ if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (put_user(len, optlen)) return -EFAULT; if (put_user(sctp_sk(sk)->autoclose, (int __user *)optval)) return -EFAULT; return 0; } /* Helper routine to branch off an association to a new socket. */ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); struct sctp_sock *sp = sctp_sk(sk); struct socket *sock; int err = 0; /* Do not peel off from one netns to another one. */ if (!net_eq(current->nsproxy->net_ns, sock_net(sk))) return -EINVAL; if (!asoc) return -EINVAL; /* An association cannot be branched off from an already peeled-off * socket, nor is this supported for TCP-style sockets. */ if (!sctp_style(sk, UDP)) return -EINVAL; /* Create a new socket. */ err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); if (err < 0) return err; sctp_copy_sock(sock->sk, sk, asoc); /* Make peeled-off sockets more like 1-1 accepted sockets. * Set the daddr and initialize id to something more random and also * copy over any ip options. */ sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sock->sk); sp->pf->copy_ip_options(sk, sock->sk); /* Populate the fields of the newsk from the oldsk and migrate the * asoc to the newsk.
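 *
 * User space normally reaches this through sctp_peeloff(3) from
 * lksctp-tools, which wraps the SCTP_SOCKOPT_PEELOFF getsockopt.
 * An illustrative sketch, with sd and assoc_id assumed valid:
 *
 *	int peeled = sctp_peeloff(sd, assoc_id);
 *
 * The peeled-off descriptor then behaves much like a one-to-one
 * accepted socket, as noted above.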
*/ err = sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); if (err) { sock_release(sock); sock = NULL; } *sockp = sock; return err; } EXPORT_SYMBOL(sctp_do_peeloff); static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff, struct file **newfile, unsigned flags) { struct socket *newsock; int retval; retval = sctp_do_peeloff(sk, peeloff->associd, &newsock); if (retval < 0) goto out; /* Map the socket to an unused fd that can be returned to the user. */ retval = get_unused_fd_flags(flags & SOCK_CLOEXEC); if (retval < 0) { sock_release(newsock); goto out; } *newfile = sock_alloc_file(newsock, 0, NULL); if (IS_ERR(*newfile)) { put_unused_fd(retval); retval = PTR_ERR(*newfile); *newfile = NULL; return retval; } pr_debug("%s: sk:%p, newsk:%p, sd:%d\n", __func__, sk, newsock->sk, retval); peeloff->sd = retval; if (flags & SOCK_NONBLOCK) (*newfile)->f_flags |= O_NONBLOCK; out: return retval; } static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen) { sctp_peeloff_arg_t peeloff; struct file *newfile = NULL; int retval = 0; if (len < sizeof(sctp_peeloff_arg_t)) return -EINVAL; len = sizeof(sctp_peeloff_arg_t); if (copy_from_user(&peeloff, optval, len)) return -EFAULT; retval = sctp_getsockopt_peeloff_common(sk, &peeloff, &newfile, 0); if (retval < 0) goto out; /* Return the fd mapped to the new socket. */ if (put_user(len, optlen)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } if (copy_to_user(optval, &peeloff, len)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } fd_install(retval, newfile); out: return retval; } static int sctp_getsockopt_peeloff_flags(struct sock *sk, int len, char __user *optval, int __user *optlen) { sctp_peeloff_flags_arg_t peeloff; struct file *newfile = NULL; int retval = 0; if (len < sizeof(sctp_peeloff_flags_arg_t)) return -EINVAL; len = sizeof(sctp_peeloff_flags_arg_t); if (copy_from_user(&peeloff, optval, len)) return -EFAULT; retval = sctp_getsockopt_peeloff_common(sk, &peeloff.p_arg, &newfile, peeloff.flags); if (retval < 0) goto out; /* Return the fd mapped to the new socket. */ if (put_user(len, optlen)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } if (copy_to_user(optval, &peeloff, len)) { fput(newfile); put_unused_fd(retval); return -EFAULT; } fd_install(retval, newfile); out: return retval; } /* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) * * Applications can enable or disable heartbeats for any peer address of * an association, modify an address's heartbeat interval, force a * heartbeat to be sent immediately, and adjust the address's maximum * number of retransmissions sent before an address is considered * unreachable. The following structure is used to access and modify an * address's parameters: * * struct sctp_paddrparams { * sctp_assoc_t spp_assoc_id; * struct sockaddr_storage spp_address; * uint32_t spp_hbinterval; * uint16_t spp_pathmaxrxt; * uint32_t spp_pathmtu; * uint32_t spp_sackdelay; * uint32_t spp_flags; * }; * * spp_assoc_id - (one-to-many style socket) This is filled in the * application, and identifies the association for * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, * in milliseconds. If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be * considered unreachable. 
If a value of zero * is present in this field then no changes are to * be made to this parameter. * spp_pathmtu - When Path MTU discovery is disabled the value * specified here will be the "fixed" path mtu. * Note that if the spp_address field is empty * then all associations on this address will * have this fixed path mtu set upon them. * * spp_sackdelay - When delayed sack is enabled, this value specifies * the number of milliseconds that sacks will be delayed * for. This value will apply to all addresses of an * association if the spp_address field is empty. Note * also that if delayed sack is enabled and this * value is set to 0, no change is made to the last * recorded delayed sack timer value. * * spp_flags - These flags are used to control various features * on an association. The flag field may contain * zero or more of the following options. * * SPP_HB_ENABLE - Enable heartbeats on the * specified address. Note that if the address * field is empty all addresses for the association * have heartbeats enabled upon them. * * SPP_HB_DISABLE - Disable heartbeats on the * specified address. Note that if the address * field is empty all addresses for the association * will have their heartbeats disabled. Note also * that SPP_HB_ENABLE and SPP_HB_DISABLE are * mutually exclusive, only one of these two should * be specified. Enabling both fields will have * undetermined results. * * SPP_HB_DEMAND - Request a user initiated heartbeat * to be made immediately. * * SPP_PMTUD_ENABLE - This field will enable PMTU * discovery upon the specified address. Note that * if the address field is empty then all addresses * on the association are affected. * * SPP_PMTUD_DISABLE - This field will disable PMTU * discovery upon the specified address. Note that * if the address field is empty then all addresses * on the association are affected. Note also that * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually * exclusive. Enabling both will have undetermined * results. * * SPP_SACKDELAY_ENABLE - Setting this flag turns * on delayed sack. The time specified in spp_sackdelay * is used to specify the sack delay for this address. Note * that if spp_address is empty then all addresses will * enable delayed sack and take on the sack delay * value specified in spp_sackdelay. * SPP_SACKDELAY_DISABLE - Setting this flag turns * off delayed sack. If the spp_address field is blank then * delayed sack is disabled for the entire association. Note * also that this field is mutually exclusive with * SPP_SACKDELAY_ENABLE, setting both will have undefined * results. * * SPP_IPV6_FLOWLABEL: Setting this flag enables the * setting of the IPV6 flow label value. The value is * contained in the spp_ipv6_flowlabel field. * Upon retrieval, this flag will be set to indicate that * the spp_ipv6_flowlabel field has a valid value returned. * If a specific destination address is set (in the * spp_address field), then the value returned is that of * the address. If just an association is specified (and * no address), then the association's default flow label * is returned. If neither an association nor a destination * is specified, then the socket's default flow label is * returned. For non-IPv6 sockets, this flag will be left * cleared. * * SPP_DSCP: Setting this flag enables the setting of the * Differentiated Services Code Point (DSCP) value * associated with either the association or a specific * address. The value is obtained in the spp_dscp field.
* Upon retrieval, this flag will be set to indicate that * the spp_dscp field has a valid value returned. If a * specific destination address is set when called (in the * spp_address field), then that specific destination * address's DSCP value is returned. If just an association * is specified, then the association's default DSCP is * returned. If neither an association nor a destination is * specified, then the socket's default DSCP is returned. * * spp_ipv6_flowlabel * - This field is used in conjunction with the * SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label. * The 20 least significant bits are used for the flow * label. This setting has precedence over any IPv6-layer * setting. * * spp_dscp - This field is used in conjunction with the SPP_DSCP flag * and contains the DSCP. The 6 most significant bits are * used for the DSCP. This setting has precedence over any * IPv4- or IPv6-layer setting. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_paddrparams params; struct sctp_transport *trans = NULL; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); if (len >= sizeof(params)) len = sizeof(params); else if (len >= ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4)) len = ALIGN(offsetof(struct sctp_paddrparams, spp_ipv6_flowlabel), 4); else return -EINVAL; if (copy_from_user(&params, optval, len)) return -EFAULT; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params.spp_address)) { trans = sctp_addr_id2transport(sk, &params.spp_address, params.spp_assoc_id); if (!trans) { pr_debug("%s: failed no transport\n", __func__); return -EINVAL; } } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params.spp_assoc_id); if (!asoc && params.spp_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { pr_debug("%s: failed no association\n", __func__); return -EINVAL; } if (trans) { /* Fetch transport values. */ params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); params.spp_pathmtu = trans->pathmtu; params.spp_pathmaxrxt = trans->pathmaxrxt; params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = trans->param_flags; if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) { params.spp_ipv6_flowlabel = trans->flowlabel & SCTP_FLOWLABEL_VAL_MASK; params.spp_flags |= SPP_IPV6_FLOWLABEL; } if (trans->dscp & SCTP_DSCP_SET_MASK) { params.spp_dscp = trans->dscp & SCTP_DSCP_VAL_MASK; params.spp_flags |= SPP_DSCP; } } else if (asoc) { /* Fetch association values. */ params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); params.spp_pathmtu = asoc->pathmtu; params.spp_pathmaxrxt = asoc->pathmaxrxt; params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = asoc->param_flags; if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) { params.spp_ipv6_flowlabel = asoc->flowlabel & SCTP_FLOWLABEL_VAL_MASK; params.spp_flags |= SPP_IPV6_FLOWLABEL; } if (asoc->dscp & SCTP_DSCP_SET_MASK) { params.spp_dscp = asoc->dscp & SCTP_DSCP_VAL_MASK; params.spp_flags |= SPP_DSCP; } } else { /* Fetch socket values.
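 * Neither a transport nor an association matched, so report the
 * per-socket defaults that seed future associations.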
*/ params.spp_hbinterval = sp->hbinterval; params.spp_pathmtu = sp->pathmtu; params.spp_sackdelay = sp->sackdelay; params.spp_pathmaxrxt = sp->pathmaxrxt; /*draft-11 doesn't say what to return in spp_flags*/ params.spp_flags = sp->param_flags; if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) { params.spp_ipv6_flowlabel = sp->flowlabel & SCTP_FLOWLABEL_VAL_MASK; params.spp_flags |= SPP_IPV6_FLOWLABEL; } if (sp->dscp & SCTP_DSCP_SET_MASK) { params.spp_dscp = sp->dscp & SCTP_DSCP_VAL_MASK; params.spp_flags |= SPP_DSCP; } } if (copy_to_user(optval, &params, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } /* * 7.1.23. Get or set delayed ack timer (SCTP_DELAYED_SACK) * * This option will affect the way delayed acks are performed. This * option allows you to get or set the delayed ack time, in * milliseconds. It also allows changing the delayed ack frequency. * Changing the frequency to 1 disables the delayed sack algorithm. If * the assoc_id is 0, then this sets or gets the endpoint's default * values. If the assoc_id field is non-zero, then the set or get * affects the specified association for the one to many model (the * assoc_id field is ignored by the one to one model). Note that if * sack_delay or sack_freq are 0 when setting this option, then the * current values will remain unchanged. * * struct sctp_sack_info { * sctp_assoc_t sack_assoc_id; * uint32_t sack_delay; * uint32_t sack_freq; * }; * * sack_assoc_id - This parameter indicates which association the user * is performing an action upon. Note that if this field's value is * zero then the endpoint's default value is changed (affecting future * associations only). * * sack_delay - This parameter contains the number of milliseconds that * the user is requesting the delayed ACK timer be set to. Note that * this value is defined in the standard to be between 200 and 500 * milliseconds. * * sack_freq - This parameter contains the number of packets that must * be received before a sack is sent without waiting for the delay * timer to expire. The default value for this is 2, setting this * value to 1 will disable the delayed sack algorithm. */ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sack_info params; struct sctp_association *asoc = NULL; struct sctp_sock *sp = sctp_sk(sk); if (len >= sizeof(struct sctp_sack_info)) { len = sizeof(struct sctp_sack_info); if (copy_from_user(&params, optval, len)) return -EFAULT; } else if (len == sizeof(struct sctp_assoc_value)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of struct sctp_assoc_value in delayed_ack socket option.\n" "Use struct sctp_sack_info instead\n", current->comm, task_pid_nr(current)); if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; /* Get association, if sack_assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params.sack_assoc_id); if (!asoc && params.sack_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { /* Fetch association values. */ if (asoc->param_flags & SPP_SACKDELAY_ENABLE) { params.sack_delay = jiffies_to_msecs(asoc->sackdelay); params.sack_freq = asoc->sackfreq; } else { params.sack_delay = 0; params.sack_freq = 1; } } else { /* Fetch socket values.
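 * No association matched, so the socket-level defaults are
 * reported instead.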
*/ if (sp->param_flags & SPP_SACKDELAY_ENABLE) { params.sack_delay = sp->sackdelay; params.sack_freq = sp->sackfreq; } else { params.sack_delay = 0; params.sack_freq = 1; } } if (copy_to_user(optval, &params, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association * initialization. The option name argument to setsockopt() and getsockopt() * is SCTP_INITMSG. * * Setting initialization parameters is effective only on an unconnected * socket (for UDP-style sockets only future associations are affected * by the change). With TCP-style sockets, this option is inherited by * sockets derived from a listener socket. */ static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen) { if (len < sizeof(struct sctp_initmsg)) return -EINVAL; len = sizeof(struct sctp_initmsg); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &sctp_sk(sk)->initmsg, len)) return -EFAULT; return 0; } static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_association *asoc; int cnt = 0; struct sctp_getaddrs getaddrs; struct sctp_transport *from; void __user *to; union sctp_addr temp; struct sctp_sock *sp = sctp_sk(sk); int addrlen; size_t space_left; int bytes_copied; if (len < sizeof(struct sctp_getaddrs)) return -EINVAL; if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) return -EFAULT; /* For UDP-style sockets, id specifies the association to query. */ asoc = sctp_id2assoc(sk, getaddrs.assoc_id); if (!asoc) return -EINVAL; to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { memcpy(&temp, &from->ipaddr, sizeof(temp)); addrlen = sctp_get_pf_specific(sk->sk_family) ->addr_to_user(sp, &temp); if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) return -EFAULT; to += addrlen; cnt++; space_left -= addrlen; } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) return -EFAULT; bytes_copied = ((char __user *)to) - optval; if (put_user(bytes_copied, optlen)) return -EFAULT; return 0; } static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, size_t space_left, int *bytes_copied) { struct sctp_sockaddr_entry *addr; union sctp_addr temp; int cnt = 0; int addrlen; struct net *net = sock_net(sk); rcu_read_lock(); list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) { if (!addr->valid) continue; if ((PF_INET == sk->sk_family) && (AF_INET6 == addr->a.sa.sa_family)) continue; if ((PF_INET6 == sk->sk_family) && inet_v6_ipv6only(sk) && (AF_INET == addr->a.sa.sa_family)) continue; memcpy(&temp, &addr->a, sizeof(temp)); if (!temp.v4.sin_port) temp.v4.sin_port = htons(port); addrlen = sctp_get_pf_specific(sk->sk_family) ->addr_to_user(sctp_sk(sk), &temp); if (space_left < addrlen) { cnt = -ENOMEM; break; } memcpy(to, &temp, addrlen); to += addrlen; cnt++; space_left -= addrlen; *bytes_copied += addrlen; } rcu_read_unlock(); return cnt; } static int sctp_getsockopt_local_addrs(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_bind_addr *bp; struct sctp_association *asoc; int cnt = 0; struct sctp_getaddrs getaddrs; struct sctp_sockaddr_entry *addr; void __user *to; union sctp_addr temp; struct sctp_sock *sp = sctp_sk(sk); int addrlen; int
err = 0; size_t space_left; int bytes_copied = 0; void *addrs; void *buf; if (len < sizeof(struct sctp_getaddrs)) return -EINVAL; if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) return -EFAULT; /* * For UDP-style sockets, id specifies the association to query. * If the id field is set to the value '0' then the locally bound * addresses are returned without regard to any particular * association. */ if (0 == getaddrs.assoc_id) { bp = &sctp_sk(sk)->ep->base.bind_addr; } else { asoc = sctp_id2assoc(sk, getaddrs.assoc_id); if (!asoc) return -EINVAL; bp = &asoc->base.bind_addr; } to = optval + offsetof(struct sctp_getaddrs, addrs); space_left = len - offsetof(struct sctp_getaddrs, addrs); addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN); if (!addrs) return -ENOMEM; /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid * addresses from the global local address list. */ if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) { cnt = sctp_copy_laddrs(sk, bp->port, addrs, space_left, &bytes_copied); if (cnt < 0) { err = cnt; goto out; } goto copy_getaddrs; } } buf = addrs; /* Protection on the bound address list is not needed since * in the socket option context we hold a socket lock and * thus the bound address list can't change. */ list_for_each_entry(addr, &bp->address_list, list) { memcpy(&temp, &addr->a, sizeof(temp)); addrlen = sctp_get_pf_specific(sk->sk_family) ->addr_to_user(sp, &temp); if (space_left < addrlen) { err = -ENOMEM; /*fixme: right error?*/ goto out; } memcpy(buf, &temp, addrlen); buf += addrlen; bytes_copied += addrlen; cnt++; space_left -= addrlen; } copy_getaddrs: if (copy_to_user(to, addrs, bytes_copied)) { err = -EFAULT; goto out; } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) { err = -EFAULT; goto out; } /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too, * but we can't change it anymore. */ if (put_user(bytes_copied, optlen)) err = -EFAULT; out: kfree(addrs); return err; } /* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) * * Requests that the local SCTP stack use the enclosed peer address as * the association primary. The enclosed address must be one of the * association peer's addresses. */ static int sctp_getsockopt_primary_addr(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_prim prim; struct sctp_association *asoc; struct sctp_sock *sp = sctp_sk(sk); if (len < sizeof(struct sctp_prim)) return -EINVAL; len = sizeof(struct sctp_prim); if (copy_from_user(&prim, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, prim.ssp_assoc_id); if (!asoc) return -EINVAL; if (!asoc->peer.primary_path) return -ENOTCONN; memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr, asoc->peer.primary_path->af_specific->sockaddr_len); sctp_get_pf_specific(sk->sk_family)->addr_to_user(sp, (union sctp_addr *)&prim.ssp_addr); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &prim, len)) return -EFAULT; return 0; } /* * 7.1.11 Set Adaptation Layer Indicator (SCTP_ADAPTATION_LAYER) * * Requests that the local endpoint set the specified Adaptation Layer * Indication parameter for all future INIT and INIT-ACK exchanges. 
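 *
 * An illustrative user-space read of the current setting, where
 * sd is an assumed SCTP socket descriptor:
 *
 *	struct sctp_setadaptation sa;
 *	socklen_t len = sizeof(sa);
 *
 *	getsockopt(sd, IPPROTO_SCTP, SCTP_ADAPTATION_LAYER, &sa, &len);
 *
 * sa.ssb_adaptation_ind then holds the indication to be sent in
 * future INIT and INIT-ACK exchanges.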
*/ static int sctp_getsockopt_adaptation_layer(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_setadaptation adaptation; if (len < sizeof(struct sctp_setadaptation)) return -EINVAL; len = sizeof(struct sctp_setadaptation); adaptation.ssb_adaptation_ind = sctp_sk(sk)->adaptation_ind; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &adaptation, len)) return -EFAULT; return 0; } /* * * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) * * Applications that wish to use the sendto() system call may wish to * specify a default set of parameters that would normally be supplied * through the inclusion of ancillary data. This socket option allows * such an application to set the default sctp_sndrcvinfo structure. * The application that wishes to use this socket option simply passes * in to this call the sctp_sndrcvinfo structure defined in Section * 5.2.2). The input parameters accepted by this call include * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, * sinfo_timetolive. The user must provide the sinfo_assoc_id field * to this call if the caller is using the UDP model. * * For getsockopt, it gets the default sctp_sndrcvinfo structure. */ static int sctp_getsockopt_default_send_param(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_sndrcvinfo info; if (len < sizeof(info)) return -EINVAL; len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); if (!asoc && info.sinfo_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { info.sinfo_stream = asoc->default_stream; info.sinfo_flags = asoc->default_flags; info.sinfo_ppid = asoc->default_ppid; info.sinfo_context = asoc->default_context; info.sinfo_timetolive = asoc->default_timetolive; } else { info.sinfo_stream = sp->default_stream; info.sinfo_flags = sp->default_flags; info.sinfo_ppid = sp->default_ppid; info.sinfo_context = sp->default_context; info.sinfo_timetolive = sp->default_timetolive; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &info, len)) return -EFAULT; return 0; } /* RFC6458, Section 8.1.31. Set/get Default Send Parameters * (SCTP_DEFAULT_SNDINFO) */ static int sctp_getsockopt_default_sndinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_sndinfo info; if (len < sizeof(info)) return -EINVAL; len = sizeof(info); if (copy_from_user(&info, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, info.snd_assoc_id); if (!asoc && info.snd_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { info.snd_sid = asoc->default_stream; info.snd_flags = asoc->default_flags; info.snd_ppid = asoc->default_ppid; info.snd_context = asoc->default_context; } else { info.snd_sid = sp->default_stream; info.snd_flags = sp->default_flags; info.snd_ppid = sp->default_ppid; info.snd_context = sp->default_context; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &info, len)) return -EFAULT; return 0; } /* * * 7.1.5 SCTP_NODELAY * * Turn on/off any Nagle-like algorithm. This means that packets are * generally sent as soon as possible and no unnecessary delays are * introduced, at the cost of more packets in the network. Expects an * integer boolean flag.
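 *
 * An illustrative user-space sketch (sd is an assumed SCTP
 * socket descriptor):
 *
 *	int on = 1;
 *
 *	setsockopt(sd, IPPROTO_SCTP, SCTP_NODELAY, &on, sizeof(on));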
*/ static int sctp_getsockopt_nodelay(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = (sctp_sk(sk)->nodelay == 1); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * * 7.1.1 SCTP_RTOINFO * * The protocol parameters used to initialize and bound retransmission * timeout (RTO) are tunable. The sctp_rtoinfo structure is used to access * and modify these parameters. * All parameters are time values, in milliseconds. A value of 0, when * modifying the parameters, indicates that the current value should not * be changed. * */ static int sctp_getsockopt_rtoinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_rtoinfo rtoinfo; struct sctp_association *asoc; if (len < sizeof(struct sctp_rtoinfo)) return -EINVAL; len = sizeof(struct sctp_rtoinfo); if (copy_from_user(&rtoinfo, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); if (!asoc && rtoinfo.srto_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Values corresponding to the specific association. */ if (asoc) { rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial); rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max); rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min); } else { /* Values corresponding to the endpoint. */ struct sctp_sock *sp = sctp_sk(sk); rtoinfo.srto_initial = sp->rtoinfo.srto_initial; rtoinfo.srto_max = sp->rtoinfo.srto_max; rtoinfo.srto_min = sp->rtoinfo.srto_min; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &rtoinfo, len)) return -EFAULT; return 0; } /* * * 7.1.2 SCTP_ASSOCINFO * * This option is used to tune the maximum retransmission attempts * of the association. * Returns an error if the new association retransmission value is * greater than the sum of the retransmission value of the peer. * See [SCTP] for more information. * */ static int sctp_getsockopt_associnfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assocparams assocparams; struct sctp_association *asoc; struct list_head *pos; int cnt = 0; if (len < sizeof(struct sctp_assocparams)) return -EINVAL; len = sizeof(struct sctp_assocparams); if (copy_from_user(&assocparams, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); if (!asoc && assocparams.sasoc_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; /* Values corresponding to the specific association */ if (asoc) { assocparams.sasoc_asocmaxrxt = asoc->max_retrans; assocparams.sasoc_peer_rwnd = asoc->peer.rwnd; assocparams.sasoc_local_rwnd = asoc->a_rwnd; assocparams.sasoc_cookie_life = ktime_to_ms(asoc->cookie_life); list_for_each(pos, &asoc->peer.transport_addr_list) { cnt++; } assocparams.sasoc_number_peer_destinations = cnt; } else { /* Values corresponding to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); assocparams.sasoc_asocmaxrxt = sp->assocparams.sasoc_asocmaxrxt; assocparams.sasoc_peer_rwnd = sp->assocparams.sasoc_peer_rwnd; assocparams.sasoc_local_rwnd = sp->assocparams.sasoc_local_rwnd; assocparams.sasoc_cookie_life = sp->assocparams.sasoc_cookie_life; assocparams.sasoc_number_peer_destinations = sp->assocparams.
sasoc_number_peer_destinations; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &assocparams, len)) return -EFAULT; return 0; } /* * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) * * This socket option is a boolean flag which turns on or off mapped V4 * addresses. If this option is turned on and the socket is type * PF_INET6, then IPv4 addresses will be mapped to V6 representation. * If this option is turned off, then no mapping will be done of V4 * addresses and a user will receive both PF_INET6 and PF_INET type * addresses on the socket. */ static int sctp_getsockopt_mappedv4(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; struct sctp_sock *sp = sctp_sk(sk); if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = sp->v4mapped; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 7.1.29. Set or Get the default context (SCTP_CONTEXT) * (chapter and verse is quoted at sctp_setsockopt_context()) */ static int sctp_getsockopt_context(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; if (len < sizeof(struct sctp_assoc_value)) return -EINVAL; len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; params.assoc_value = asoc ? asoc->default_rcv_context : sctp_sk(sk)->default_rcv_context; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &params, len)) return -EFAULT; return 0; } /* * 8.1.16. Get or Set the Maximum Fragmentation Size (SCTP_MAXSEG) * This option will get or set the maximum size to put in any outgoing * SCTP DATA chunk. If a message is larger than this size it will be * fragmented by SCTP into the specified size. Note that the underlying * SCTP implementation may fragment into smaller sized chunks when the * PMTU of the underlying association is smaller than the value set by * the user. The default value for this option is '0' which indicates * the user is NOT limiting fragmentation and only the PMTU will affect * SCTP's choice of DATA chunk size. Note also that values set larger * than the maximum size of an IP datagram will effectively let SCTP * control fragmentation (i.e. the same as setting this option to 0). * * The following structure is used to access and modify this parameter: * * struct sctp_assoc_value { * sctp_assoc_t assoc_id; * uint32_t assoc_value; * }; * * assoc_id: This parameter is ignored for one-to-one style sockets. * For one-to-many style sockets this parameter indicates which * association the user is performing an action upon. Note that if * this field's value is zero then the endpoint's default value is * changed (affecting future associations only). * assoc_value: This parameter specifies the maximum size in bytes.
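 *
 * An illustrative user-space query, with sd and assoc_id assumed
 * valid (assoc_id is ignored on one-to-one style sockets):
 *
 *	struct sctp_assoc_value av = { .assoc_id = assoc_id };
 *	socklen_t len = sizeof(av);
 *
 *	getsockopt(sd, IPPROTO_SCTP, SCTP_MAXSEG, &av, &len);
 *
 * av.assoc_value then reports the current limit in bytes.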
*/ static int sctp_getsockopt_maxseg(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; if (len == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in maxseg socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); params.assoc_id = SCTP_FUTURE_ASSOC; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) params.assoc_value = asoc->frag_point; else params.assoc_value = sctp_sk(sk)->user_frag; if (put_user(len, optlen)) return -EFAULT; if (len == sizeof(int)) { if (copy_to_user(optval, &params.assoc_value, len)) return -EFAULT; } else { if (copy_to_user(optval, &params, len)) return -EFAULT; } return 0; } /* * 7.1.24. Get or set fragmented interleave (SCTP_FRAGMENT_INTERLEAVE) * (chapter and verse is quoted at sctp_setsockopt_fragment_interleave()) */ static int sctp_getsockopt_fragment_interleave(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = sctp_sk(sk)->frag_interleave; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 7.1.25. Set or Get the sctp partial delivery point * (chapter and verse is quoted at sctp_setsockopt_partial_delivery_point()) */ static int sctp_getsockopt_partial_delivery_point(struct sock *sk, int len, char __user *optval, int __user *optlen) { u32 val; if (len < sizeof(u32)) return -EINVAL; len = sizeof(u32); val = sctp_sk(sk)->pd_point; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 7.1.28. Set or Get the maximum burst (SCTP_MAX_BURST) * (chapter and verse is quoted at sctp_setsockopt_maxburst()) */ static int sctp_getsockopt_maxburst(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; if (len == sizeof(int)) { pr_warn_ratelimited(DEPRECATED "%s (pid %d) " "Use of int in max_burst socket option.\n" "Use struct sctp_assoc_value instead\n", current->comm, task_pid_nr(current)); params.assoc_id = SCTP_FUTURE_ASSOC; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); if (copy_from_user(&params, optval, len)) return -EFAULT; } else return -EINVAL; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; params.assoc_value = asoc ?
asoc->max_burst : sctp_sk(sk)->max_burst; if (len == sizeof(int)) { if (copy_to_user(optval, &params.assoc_value, len)) return -EFAULT; } else { if (copy_to_user(optval, &params, len)) return -EFAULT; } return 0; } static int sctp_getsockopt_hmac_ident(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_hmacalgo __user *p = (void __user *)optval; struct sctp_hmac_algo_param *hmacs; __u16 data_len = 0; u32 num_idents; int i; if (!ep->auth_enable) return -EACCES; hmacs = ep->auth_hmacs_list; data_len = ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr); if (len < sizeof(struct sctp_hmacalgo) + data_len) return -EINVAL; len = sizeof(struct sctp_hmacalgo) + data_len; num_idents = data_len / sizeof(u16); if (put_user(len, optlen)) return -EFAULT; if (put_user(num_idents, &p->shmac_num_idents)) return -EFAULT; for (i = 0; i < num_idents; i++) { __u16 hmacid = ntohs(hmacs->hmac_ids[i]); if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16))) return -EFAULT; } return 0; } static int sctp_getsockopt_active_key(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authkeyid val; struct sctp_association *asoc; if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; len = sizeof(struct sctp_authkeyid); if (copy_from_user(&val, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, val.scact_assoc_id); if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; val.scact_keynumber = asoc->active_key_id; } else { if (!ep->auth_enable) return -EACCES; val.scact_keynumber = ep->active_key_id; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; struct sctp_chunks_param *ch; u32 num_chunks = 0; char __user *to; if (len < sizeof(struct sctp_authchunks)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; asoc = sctp_id2assoc(sk, val.gauth_assoc_id); if (!asoc) return -EINVAL; if (!asoc->peer.auth_capable) return -EACCES; ch = asoc->peer.peer_chunks; if (!ch) goto num; /* See if the user provided enough room for all the data */ num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr); if (len < num_chunks) return -EINVAL; if (copy_to_user(to, ch->chunks, num_chunks)) return -EFAULT; num: len = sizeof(struct sctp_authchunks) + num_chunks; if (put_user(len, optlen)) return -EFAULT; if (put_user(num_chunks, &p->gauth_number_of_chunks)) return -EFAULT; return 0; } static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_authchunks __user *p = (void __user *)optval; struct sctp_authchunks val; struct sctp_association *asoc; struct sctp_chunks_param *ch; u32 num_chunks = 0; char __user *to; if (len < sizeof(struct sctp_authchunks)) return -EINVAL; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; asoc = sctp_id2assoc(sk, val.gauth_assoc_id); if (!asoc && val.gauth_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { if (!asoc->peer.auth_capable) return -EACCES; ch = (struct
sctp_chunks_param *)asoc->c.auth_chunks; } else { if (!ep->auth_enable) return -EACCES; ch = ep->auth_chunk_list; } if (!ch) goto num; num_chunks = ntohs(ch->param_hdr.length) - sizeof(struct sctp_paramhdr); if (len < sizeof(struct sctp_authchunks) + num_chunks) return -EINVAL; if (copy_to_user(to, ch->chunks, num_chunks)) return -EFAULT; num: len = sizeof(struct sctp_authchunks) + num_chunks; if (put_user(len, optlen)) return -EFAULT; if (put_user(num_chunks, &p->gauth_number_of_chunks)) return -EFAULT; return 0; } /* * 8.2.5. Get the Current Number of Associations (SCTP_GET_ASSOC_NUMBER) * This option gets the current number of associations that are attached * to a one-to-many style socket. The option value is an uint32_t. */ static int sctp_getsockopt_assoc_number(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; u32 val = 0; if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (len < sizeof(u32)) return -EINVAL; len = sizeof(u32); list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { val++; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 8.1.23 SCTP_AUTO_ASCONF * See the corresponding setsockopt entry as description */ static int sctp_getsockopt_auto_asconf(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val = 0; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (sctp_sk(sk)->do_auto_asconf && sctp_is_ep_boundall(sk)) val = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * 8.2.6. Get the Current Identifiers of Associations * (SCTP_GET_ASSOC_ID_LIST) * * This option gets the current list of SCTP association identifiers of * the SCTP associations handled by a one-to-many style socket. */ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_association *asoc; struct sctp_assoc_ids *ids; u32 num = 0; if (sctp_style(sk, TCP)) return -EOPNOTSUPP; if (len < sizeof(struct sctp_assoc_ids)) return -EINVAL; list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { num++; } if (len < sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num) return -EINVAL; len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; ids->gaids_number_of_ids = num; num = 0; list_for_each_entry(asoc, &(sp->ep->asocs), asocs) { ids->gaids_assoc_id[num++] = asoc->assoc_id; } if (put_user(len, optlen) || copy_to_user(optval, ids, len)) { kfree(ids); return -EFAULT; } kfree(ids); return 0; } /* * SCTP_PEER_ADDR_THLDS * * This option allows us to fetch the partially failed threshold for one or all * transports in an association. See Section 6.1 of: * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt */ static int sctp_getsockopt_paddr_thresholds(struct sock *sk, char __user *optval, int len, int __user *optlen, bool v2) { struct sctp_paddrthlds_v2 val; struct sctp_transport *trans; struct sctp_association *asoc; int min; min = v2 ? 
sizeof(val) : sizeof(struct sctp_paddrthlds); if (len < min) return -EINVAL; len = min; if (copy_from_user(&val, optval, len)) return -EFAULT; if (!sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) { trans = sctp_addr_id2transport(sk, &val.spt_address, val.spt_assoc_id); if (!trans) return -ENOENT; val.spt_pathmaxrxt = trans->pathmaxrxt; val.spt_pathpfthld = trans->pf_retrans; val.spt_pathcpthld = trans->ps_retrans; goto out; } asoc = sctp_id2assoc(sk, val.spt_assoc_id); if (!asoc && val.spt_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; if (asoc) { val.spt_pathpfthld = asoc->pf_retrans; val.spt_pathmaxrxt = asoc->pathmaxrxt; val.spt_pathcpthld = asoc->ps_retrans; } else { struct sctp_sock *sp = sctp_sk(sk); val.spt_pathpfthld = sp->pf_retrans; val.spt_pathmaxrxt = sp->pathmaxrxt; val.spt_pathcpthld = sp->ps_retrans; } out: if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } /* * SCTP_GET_ASSOC_STATS * * This option retrieves local per endpoint statistics. It is modeled * after OpenSolaris' implementation */ static int sctp_getsockopt_assoc_stats(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_stats sas; struct sctp_association *asoc = NULL; /* User must provide at least the assoc id */ if (len < sizeof(sctp_assoc_t)) return -EINVAL; /* Allow the struct to grow and fill in as much as possible */ len = min_t(size_t, len, sizeof(sas)); if (copy_from_user(&sas, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, sas.sas_assoc_id); if (!asoc) return -EINVAL; sas.sas_rtxchunks = asoc->stats.rtxchunks; sas.sas_gapcnt = asoc->stats.gapcnt; sas.sas_outofseqtsns = asoc->stats.outofseqtsns; sas.sas_osacks = asoc->stats.osacks; sas.sas_isacks = asoc->stats.isacks; sas.sas_octrlchunks = asoc->stats.octrlchunks; sas.sas_ictrlchunks = asoc->stats.ictrlchunks; sas.sas_oodchunks = asoc->stats.oodchunks; sas.sas_iodchunks = asoc->stats.iodchunks; sas.sas_ouodchunks = asoc->stats.ouodchunks; sas.sas_iuodchunks = asoc->stats.iuodchunks; sas.sas_idupchunks = asoc->stats.idupchunks; sas.sas_opackets = asoc->stats.opackets; sas.sas_ipackets = asoc->stats.ipackets; /* New high max rto observed, will return 0 if not a single * RTO update took place. 
obs_rto_ipaddr will be bogus * in such a case */ sas.sas_maxrto = asoc->stats.max_obs_rto; memcpy(&sas.sas_obs_rto_ipaddr, &asoc->stats.obs_rto_ipaddr, sizeof(struct sockaddr_storage)); /* Mark beginning of a new observation period */ asoc->stats.max_obs_rto = asoc->rto_min; if (put_user(len, optlen)) return -EFAULT; pr_debug("%s: len:%d, assoc_id:%d\n", __func__, len, sas.sas_assoc_id); if (copy_to_user(optval, &sas, len)) return -EFAULT; return 0; } static int sctp_getsockopt_recvrcvinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val = 0; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (sctp_sk(sk)->recvrcvinfo) val = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_recvnxtinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val = 0; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); if (sctp_sk(sk)->recvnxtinfo) val = 1; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_pr_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.prsctp_capable : sctp_sk(sk)->ep->prsctp_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_default_prinfo(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_default_prinfo info; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(info)) { retval = -EINVAL; goto out; } len = sizeof(info); if (copy_from_user(&info, optval, len)) goto out; asoc = sctp_id2assoc(sk, info.pr_assoc_id); if (!asoc && info.pr_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } if (asoc) { info.pr_policy = SCTP_PR_POLICY(asoc->default_flags); info.pr_value = asoc->default_timetolive; } else { struct sctp_sock *sp = sctp_sk(sk); info.pr_policy = SCTP_PR_POLICY(sp->default_flags); info.pr_value = sp->default_timetolive; } if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &info, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_prstatus params; struct sctp_association *asoc; int policy; int retval = -EINVAL; if (len < sizeof(params)) goto out; len = sizeof(params); if (copy_from_user(&params, optval, len)) { retval = -EFAULT; goto out; } policy = params.sprstat_policy; if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += asoc->abandoned_unsent[policy]; params.sprstat_abandoned_sent += asoc->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent =
asoc->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = asoc->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen)) { retval = -EFAULT; goto out; } if (copy_to_user(optval, &params, len)) { retval = -EFAULT; goto out; } retval = 0; out: return retval; } static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; int policy; if (len < sizeof(params)) goto out; len = sizeof(params); if (copy_from_user(&params, optval, len)) { retval = -EFAULT; goto out; } policy = params.sprstat_policy; if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL)) || ((policy & SCTP_PR_SCTP_ALL) && (policy & SCTP_PR_SCTP_MASK))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; streamoute = SCTP_SO(&asoc->stream, params.sprstat_sid)->ext; if (!streamoute) { /* Not allocated yet, means all stats are 0 */ params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; retval = 0; goto out; } if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, &params, len)) { retval = -EFAULT; goto out; } retval = 0; out: return retval; } static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.reconf_capable : sctp_sk(sk)->ep->reconf_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_enable_strreset(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ?
asoc->strreset_enable : sctp_sk(sk)->ep->strreset_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_scheduler(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? sctp_sched_get_sched(asoc) : sctp_sk(sk)->default_ss; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_stream_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc) { retval = -EINVAL; goto out; } retval = sctp_sched_get_value(asoc, params.stream_id, &params.stream_value); if (retval) goto out; if (put_user(len, optlen)) { retval = -EFAULT; goto out; } if (copy_to_user(optval, &params, len)) { retval = -EFAULT; goto out; } out: return retval; } static int sctp_getsockopt_interleaving_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.intl_capable : sctp_sk(sk)->ep->intl_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_reuse_port(struct sock *sk, int len, char __user *optval, int __user *optlen) { int val; if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = sctp_sk(sk)->reuse; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int sctp_getsockopt_event(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_association *asoc; struct sctp_event param; __u16 subscribe; if (len < sizeof(param)) return -EINVAL; len = sizeof(param); if (copy_from_user(&param, optval, len)) return -EFAULT; if (param.se_type < SCTP_SN_TYPE_BASE || param.se_type > SCTP_SN_TYPE_MAX) return -EINVAL; asoc = sctp_id2assoc(sk, param.se_assoc_id); if (!asoc && param.se_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) return -EINVAL; subscribe = asoc ?
asoc->subscribe : sctp_sk(sk)->subscribe; param.se_on = sctp_ulpevent_type_enabled(subscribe, param.se_type); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &param, len)) return -EFAULT; return 0; } static int sctp_getsockopt_asconf_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.asconf_capable : sctp_sk(sk)->ep->asconf_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_auth_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.auth_capable : sctp_sk(sk)->ep->auth_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_ecn_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->peer.ecn_capable : sctp_sk(sk)->ep->ecn_enable; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_pf_expose(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_assoc_value params; struct sctp_association *asoc; int retval = -EFAULT; if (len < sizeof(params)) { retval = -EINVAL; goto out; } len = sizeof(params); if (copy_from_user(&params, optval, len)) goto out; asoc = sctp_id2assoc(sk, params.assoc_id); if (!asoc && params.assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { retval = -EINVAL; goto out; } params.assoc_value = asoc ? asoc->pf_expose : sctp_sk(sk)->pf_expose; if (put_user(len, optlen)) goto out; if (copy_to_user(optval, &params, len)) goto out; retval = 0; out: return retval; } static int sctp_getsockopt_encap_port(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_association *asoc; struct sctp_udpencaps encap; struct sctp_transport *t; __be16 encap_port; if (len < sizeof(encap)) return -EINVAL; len = sizeof(encap); if (copy_from_user(&encap, optval, len)) return -EFAULT; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid.
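* For example (editor's note): querying SCTP_REMOTE_UDP_ENCAPS_PORT with a specific peer address that belongs to none of this socket's associations fails with -EINVAL below, rather than falling back to the socket-level default.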
*/ if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) { t = sctp_addr_id2transport(sk, &encap.sue_address, encap.sue_assoc_id); if (!t) { pr_debug("%s: failed no transport\n", __func__); return -EINVAL; } encap_port = t->encap_port; goto out; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, encap.sue_assoc_id); if (!asoc && encap.sue_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { pr_debug("%s: failed no association\n", __func__); return -EINVAL; } if (asoc) { encap_port = asoc->encap_port; goto out; } encap_port = sctp_sk(sk)->encap_port; out: encap.sue_port = (__force uint16_t)encap_port; if (copy_to_user(optval, &encap, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int sctp_getsockopt_probe_interval(struct sock *sk, int len, char __user *optval, int __user *optlen) { struct sctp_probeinterval params; struct sctp_association *asoc; struct sctp_transport *t; __u32 probe_interval; if (len < sizeof(params)) return -EINVAL; len = sizeof(params); if (copy_from_user(&params, optval, len)) return -EFAULT; /* If an address other than INADDR_ANY is specified, and * no transport is found, then the request is invalid. */ if (!sctp_is_any(sk, (union sctp_addr *)&params.spi_address)) { t = sctp_addr_id2transport(sk, &params.spi_address, params.spi_assoc_id); if (!t) { pr_debug("%s: failed no transport\n", __func__); return -EINVAL; } probe_interval = jiffies_to_msecs(t->probe_interval); goto out; } /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the * socket is a one to many style socket, and an association * was not found, then the id was invalid. */ asoc = sctp_id2assoc(sk, params.spi_assoc_id); if (!asoc && params.spi_assoc_id != SCTP_FUTURE_ASSOC && sctp_style(sk, UDP)) { pr_debug("%s: failed no association\n", __func__); return -EINVAL; } if (asoc) { probe_interval = jiffies_to_msecs(asoc->probe_interval); goto out; } probe_interval = sctp_sk(sk)->probe_interval; out: params.spi_interval = probe_interval; if (copy_to_user(optval, &params, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int retval = 0; int len; pr_debug("%s: sk:%p, optname:%d\n", __func__, sk, optname); /* I can hardly begin to describe how wrong this is. This is * so broken as to be worse than useless. The API draft * REALLY is NOT helpful here... I am not convinced that the * semantics of getsockopt() with a level OTHER THAN SOL_SCTP * are at all well-founded.
*/ if (level != SOL_SCTP) { struct sctp_af *af = sctp_sk(sk)->pf->af; retval = af->getsockopt(sk, level, optname, optval, optlen); return retval; } if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; lock_sock(sk); switch (optname) { case SCTP_STATUS: retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen); break; case SCTP_DISABLE_FRAGMENTS: retval = sctp_getsockopt_disable_fragments(sk, len, optval, optlen); break; case SCTP_EVENTS: retval = sctp_getsockopt_events(sk, len, optval, optlen); break; case SCTP_AUTOCLOSE: retval = sctp_getsockopt_autoclose(sk, len, optval, optlen); break; case SCTP_SOCKOPT_PEELOFF: retval = sctp_getsockopt_peeloff(sk, len, optval, optlen); break; case SCTP_SOCKOPT_PEELOFF_FLAGS: retval = sctp_getsockopt_peeloff_flags(sk, len, optval, optlen); break; case SCTP_PEER_ADDR_PARAMS: retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; case SCTP_DELAYED_SACK: retval = sctp_getsockopt_delayed_ack(sk, len, optval, optlen); break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; case SCTP_GET_PEER_ADDRS: retval = sctp_getsockopt_peer_addrs(sk, len, optval, optlen); break; case SCTP_GET_LOCAL_ADDRS: retval = sctp_getsockopt_local_addrs(sk, len, optval, optlen); break; case SCTP_SOCKOPT_CONNECTX3: retval = sctp_getsockopt_connectx3(sk, len, optval, optlen); break; case SCTP_DEFAULT_SEND_PARAM: retval = sctp_getsockopt_default_send_param(sk, len, optval, optlen); break; case SCTP_DEFAULT_SNDINFO: retval = sctp_getsockopt_default_sndinfo(sk, len, optval, optlen); break; case SCTP_PRIMARY_ADDR: retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); break; case SCTP_NODELAY: retval = sctp_getsockopt_nodelay(sk, len, optval, optlen); break; case SCTP_RTOINFO: retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen); break; case SCTP_ASSOCINFO: retval = sctp_getsockopt_associnfo(sk, len, optval, optlen); break; case SCTP_I_WANT_MAPPED_V4_ADDR: retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen); break; case SCTP_MAXSEG: retval = sctp_getsockopt_maxseg(sk, len, optval, optlen); break; case SCTP_GET_PEER_ADDR_INFO: retval = sctp_getsockopt_peer_addr_info(sk, len, optval, optlen); break; case SCTP_ADAPTATION_LAYER: retval = sctp_getsockopt_adaptation_layer(sk, len, optval, optlen); break; case SCTP_CONTEXT: retval = sctp_getsockopt_context(sk, len, optval, optlen); break; case SCTP_FRAGMENT_INTERLEAVE: retval = sctp_getsockopt_fragment_interleave(sk, len, optval, optlen); break; case SCTP_PARTIAL_DELIVERY_POINT: retval = sctp_getsockopt_partial_delivery_point(sk, len, optval, optlen); break; case SCTP_MAX_BURST: retval = sctp_getsockopt_maxburst(sk, len, optval, optlen); break; case SCTP_AUTH_KEY: case SCTP_AUTH_CHUNK: case SCTP_AUTH_DELETE_KEY: case SCTP_AUTH_DEACTIVATE_KEY: retval = -EOPNOTSUPP; break; case SCTP_HMAC_IDENT: retval = sctp_getsockopt_hmac_ident(sk, len, optval, optlen); break; case SCTP_AUTH_ACTIVE_KEY: retval = sctp_getsockopt_active_key(sk, len, optval, optlen); break; case SCTP_PEER_AUTH_CHUNKS: retval = sctp_getsockopt_peer_auth_chunks(sk, len, optval, optlen); break; case SCTP_LOCAL_AUTH_CHUNKS: retval = sctp_getsockopt_local_auth_chunks(sk, len, optval, optlen); break; case SCTP_GET_ASSOC_NUMBER: retval = sctp_getsockopt_assoc_number(sk, len, optval, optlen); break; case SCTP_GET_ASSOC_ID_LIST: retval = sctp_getsockopt_assoc_ids(sk, len, optval, optlen); break; case SCTP_AUTO_ASCONF: retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen); break; case 
SCTP_PEER_ADDR_THLDS: retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen, false); break; case SCTP_PEER_ADDR_THLDS_V2: retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen, true); break; case SCTP_GET_ASSOC_STATS: retval = sctp_getsockopt_assoc_stats(sk, len, optval, optlen); break; case SCTP_RECVRCVINFO: retval = sctp_getsockopt_recvrcvinfo(sk, len, optval, optlen); break; case SCTP_RECVNXTINFO: retval = sctp_getsockopt_recvnxtinfo(sk, len, optval, optlen); break; case SCTP_PR_SUPPORTED: retval = sctp_getsockopt_pr_supported(sk, len, optval, optlen); break; case SCTP_DEFAULT_PRINFO: retval = sctp_getsockopt_default_prinfo(sk, len, optval, optlen); break; case SCTP_PR_ASSOC_STATUS: retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; case SCTP_PR_STREAM_STATUS: retval = sctp_getsockopt_pr_streamstatus(sk, len, optval, optlen); break; case SCTP_RECONFIG_SUPPORTED: retval = sctp_getsockopt_reconfig_supported(sk, len, optval, optlen); break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; case SCTP_STREAM_SCHEDULER: retval = sctp_getsockopt_scheduler(sk, len, optval, optlen); break; case SCTP_STREAM_SCHEDULER_VALUE: retval = sctp_getsockopt_scheduler_value(sk, len, optval, optlen); break; case SCTP_INTERLEAVING_SUPPORTED: retval = sctp_getsockopt_interleaving_supported(sk, len, optval, optlen); break; case SCTP_REUSE_PORT: retval = sctp_getsockopt_reuse_port(sk, len, optval, optlen); break; case SCTP_EVENT: retval = sctp_getsockopt_event(sk, len, optval, optlen); break; case SCTP_ASCONF_SUPPORTED: retval = sctp_getsockopt_asconf_supported(sk, len, optval, optlen); break; case SCTP_AUTH_SUPPORTED: retval = sctp_getsockopt_auth_supported(sk, len, optval, optlen); break; case SCTP_ECN_SUPPORTED: retval = sctp_getsockopt_ecn_supported(sk, len, optval, optlen); break; case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE: retval = sctp_getsockopt_pf_expose(sk, len, optval, optlen); break; case SCTP_REMOTE_UDP_ENCAPS_PORT: retval = sctp_getsockopt_encap_port(sk, len, optval, optlen); break; case SCTP_PLPMTUD_PROBE_INTERVAL: retval = sctp_getsockopt_probe_interval(sk, len, optval, optlen); break; default: retval = -ENOPROTOOPT; break; } release_sock(sk); return retval; } static bool sctp_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SCTP) { switch (optname) { case SCTP_SOCKOPT_PEELOFF: case SCTP_SOCKOPT_PEELOFF_FLAGS: case SCTP_SOCKOPT_CONNECTX3: return true; default: return false; } } return false; } static int sctp_hash(struct sock *sk) { /* STUB */ return 0; } static void sctp_unhash(struct sock *sk) { /* STUB */ } /* Check if port is acceptable. Possibly find first available port. * * The port hash table (contained in the 'global' SCTP protocol storage * returned by struct sctp_protocol *sctp_get_protocol()). The hash * table is an array of 4096 lists (sctp_bind_hashbucket). Each * list (the list number is the port number hashed out, so as you * would expect from a hash function, all the ports in a given list have * such a number that hashes out to the same list number; you were * expecting that, right?); so each list has a set of ports, with a * link to the socket (struct sock) that uses it, the port number and * a fastreuse flag (FIXME: NPI ipg). 
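* * A worked example (editor's note, assuming the 4096-bucket table described above and ignoring the per-net mixing that sctp_phashfn() folds in): the bucket index is essentially port & 4095, so port 80 and port 4176 (= 4096 + 80) land on the same list, and a lookup for either must walk that one bucket's chain comparing pp->port and pp->net exactly.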
*/ static struct sctp_bind_bucket *sctp_bucket_create( struct sctp_bind_hashbucket *head, struct net *, unsigned short snum); static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) { struct sctp_sock *sp = sctp_sk(sk); bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct net *net = sock_net(sk); kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; unsigned short snum; int ret; snum = ntohs(addr->v4.sin_port); pr_debug("%s: begins, snum:%d\n", __func__, snum); if (snum == 0) { /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_below(remaining) + low; do { rover++; if ((rover < low) || (rover > high)) rover = low; if (inet_is_local_reserved_port(net, rover)) continue; index = sctp_phashfn(net, rover); head = &sctp_port_hashtable[index]; spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) if ((pp->port == rover) && net_eq(net, pp->net)) goto next; break; next: spin_unlock_bh(&head->lock); cond_resched(); } while (--remaining > 0); /* Exhausted local port range during search? */ ret = 1; if (remaining <= 0) return ret; /* OK, here is the one we will use. HEAD (the port * hash table list entry) is non-NULL and we hold its * mutex. */ snum = rover; } else { /* We are given a specific port number; we verify * that it is not being used. If it is used, we will * exhaust the search in the hash list corresponding * to the port number (snum) - we detect that with the * port iterator, pp being NULL. */ head = &sctp_port_hashtable[sctp_phashfn(net, snum)]; spin_lock_bh(&head->lock); sctp_for_each_hentry(pp, &head->chain) { if ((pp->port == snum) && net_eq(pp->net, net)) goto pp_found; } } pp = NULL; goto pp_not_found; pp_found: if (!hlist_empty(&pp->owner)) { /* We had a port hash table hit - there is an * available port (pp != NULL) and it is being * used by another socket (pp->owner not empty); that other * socket is going to be sk2. */ struct sock *sk2; pr_debug("%s: found a possible match\n", __func__); if ((pp->fastreuse && reuse && sk->sk_state != SCTP_SS_LISTENING) || (pp->fastreuseport && sk->sk_reuseport && uid_eq(pp->fastuid, uid))) goto success; /* Run through the list of sockets bound to the port * (pp->port) [via the pointers bind_next and * bind_pprev in the struct sock *sk2 (pp->sk)]. On each one, * we get the endpoint they describe and run through * the endpoint's list of IP (v4 or v6) addresses, * comparing each of the addresses with the address of * the socket sk. If we find a match, then that means * that this port/socket (sk) combination is already * in use by an endpoint. */ sk_for_each_bound(sk2, &pp->owner) { int bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); struct sctp_sock *sp2 = sctp_sk(sk2); struct sctp_endpoint *ep2 = sp2->ep; if (sk == sk2 || (reuse && (sk2->sk_reuse || sp2->reuse) && sk2->sk_state != SCTP_SS_LISTENING) || (sk->sk_reuseport && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)))) continue; if ((!sk->sk_bound_dev_if || !bound_dev_if2 || sk->sk_bound_dev_if == bound_dev_if2) && sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, sp2, sp)) { ret = 1; goto fail_unlock; } } pr_debug("%s: found a match\n", __func__); } pp_not_found: /* If there was a hash table miss, create a new port.
*/ ret = 1; if (!pp && !(pp = sctp_bucket_create(head, net, snum))) goto fail_unlock; /* In either case (hit or miss), make sure fastreuse is 1 only * if sk->sk_reuse is too (that is, if the caller requested * SO_REUSEADDR on this socket -sk-). */ if (hlist_empty(&pp->owner)) { if (reuse && sk->sk_state != SCTP_SS_LISTENING) pp->fastreuse = 1; else pp->fastreuse = 0; if (sk->sk_reuseport) { pp->fastreuseport = 1; pp->fastuid = uid; } else { pp->fastreuseport = 0; } } else { if (pp->fastreuse && (!reuse || sk->sk_state == SCTP_SS_LISTENING)) pp->fastreuse = 0; if (pp->fastreuseport && (!sk->sk_reuseport || !uid_eq(pp->fastuid, uid))) pp->fastreuseport = 0; } /* We are set, so fill up all the data in the hash table * entry, tie the socket list information with the rest of the * sockets FIXME: Blurry, NPI (ipg). */ success: if (!sp->bind_hash) { inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &pp->owner); sp->bind_hash = pp; } ret = 0; fail_unlock: spin_unlock_bh(&head->lock); return ret; } /* Assign a 'snum' port to the socket. If snum == 0, an ephemeral * port is requested. */ static int sctp_get_port(struct sock *sk, unsigned short snum) { union sctp_addr addr; struct sctp_af *af = sctp_sk(sk)->pf->af; /* Set up a dummy address struct from the sk. */ af->from_sk(&addr, sk); addr.v4.sin_port = htons(snum); /* Note: sk->sk_num gets filled in if ephemeral port request. */ return sctp_get_port_local(sk, &addr); } /* * Move a socket to LISTENING state. */ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; struct crypto_shash *tfm = NULL; char alg[32]; /* Allocate HMAC for generating cookie. */ if (!sp->hmac && sp->sctp_hmac_alg) { sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); tfm = crypto_alloc_shash(alg, 0, 0); if (IS_ERR(tfm)) { net_info_ratelimited("failed to load transform for %s: %ld\n", sp->sctp_hmac_alg, PTR_ERR(tfm)); return -ENOSYS; } sctp_sk(sk)->hmac = tfm; } /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system * picks an ephemeral port and will choose an address set equivalent * to binding with a wildcard address. * * This is not currently spelled out in the SCTP sockets * extensions draft, but follows the practice as seen in TCP * sockets. * */ inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) return -EAGAIN; } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { inet_sk_set_state(sk, SCTP_SS_CLOSED); return -EADDRINUSE; } } WRITE_ONCE(sk->sk_max_ack_backlog, backlog); return sctp_hash_endpoint(ep); } /* * 4.1.3 / 5.1.3 listen() * * By default, new associations are not accepted for UDP style sockets. * An application uses listen() to mark a socket as being able to * accept new associations. * * On TCP style sockets, applications use listen() to ready the SCTP * endpoint for accepting inbound associations. * * On both types of endpoints a backlog of '0' disables listening. * * Move a socket to LISTENING state. */ int sctp_inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct sctp_endpoint *ep = sctp_sk(sk)->ep; int err = -EINVAL; if (unlikely(backlog < 0)) return err; lock_sock(sk); /* Peeled-off sockets are not allowed to listen(). */ if (sctp_style(sk, UDP_HIGH_BANDWIDTH)) goto out; if (sock->state != SS_UNCONNECTED) goto out; if (!sctp_sstate(sk, LISTENING) && !sctp_sstate(sk, CLOSED)) goto out; /* If backlog is zero, disable listening. 
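* For example (editor's illustration): calling listen(fd, 0) on a socket that is currently LISTENING drops into the branch below, unhashes the endpoint and returns the socket to the CLOSED state, so no new associations are accepted.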
*/ if (!backlog) { if (sctp_sstate(sk, CLOSED)) goto out; err = 0; sctp_unhash_endpoint(ep); sk->sk_state = SCTP_SS_CLOSED; if (sk->sk_reuse || sctp_sk(sk)->reuse) sctp_sk(sk)->bind_hash->fastreuse = 1; goto out; } /* If we are already listening, just update the backlog */ if (sctp_sstate(sk, LISTENING)) WRITE_ONCE(sk->sk_max_ack_backlog, backlog); else { err = sctp_listen_start(sk, backlog); if (err) goto out; } err = 0; out: release_sock(sk); return err; } /* * This function is modeled on the current datagram_poll() and * tcp_poll(). Note that, based on those implementations, we don't * lock the socket in this function, even though it seems that, * ideally, locking or some other mechanism could be used to ensure * the integrity of the counters (sndbuf and wmem_alloc) used * here. We assume that we don't need locks either until proven * otherwise. * * Another thing to note is that we include the Async I/O support * here, again, by modeling the current TCP/UDP code. We don't have * a good way to test with it yet. */ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct sctp_sock *sp = sctp_sk(sk); __poll_t mask; poll_wait(file, sk_sleep(sk), wait); sock_rps_record_flow(sk); /* A TCP-style listening socket becomes readable when the accept queue * is not empty. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) return (!list_empty(&sp->ep->asocs)) ? (EPOLLIN | EPOLLRDNORM) : 0; mask = 0; /* Are there any exceptional events? */ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* The association is either gone or not ready. */ if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED)) return mask; /* Is it writable? */ if (sctp_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* * Since the socket is not locked, the buffer * might be made available after the writeable check and * before the bit is set. This could cause a lost I/O * signal. tcp_poll() has a race breaker for this race * condition. Based on their implementation, we put * in the following code to cover it as well. */ if (sctp_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } return mask; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ static struct sctp_bind_bucket *sctp_bucket_create( struct sctp_bind_hashbucket *head, struct net *net, unsigned short snum) { struct sctp_bind_bucket *pp; pp = kmem_cache_alloc(sctp_bucket_cachep, GFP_ATOMIC); if (pp) { SCTP_DBG_OBJCNT_INC(bind_bucket); pp->port = snum; pp->fastreuse = 0; INIT_HLIST_HEAD(&pp->owner); pp->net = net; hlist_add_head(&pp->node, &head->chain); } return pp; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ static void sctp_bucket_destroy(struct sctp_bind_bucket *pp) { if (pp && hlist_empty(&pp->owner)) { __hlist_del(&pp->node); kmem_cache_free(sctp_bucket_cachep, pp); SCTP_DBG_OBJCNT_DEC(bind_bucket); } } /* Release this socket's reference to a local port.
*/ static inline void __sctp_put_port(struct sock *sk) { struct sctp_bind_hashbucket *head = &sctp_port_hashtable[sctp_phashfn(sock_net(sk), inet_sk(sk)->inet_num)]; struct sctp_bind_bucket *pp; spin_lock(&head->lock); pp = sctp_sk(sk)->bind_hash; __sk_del_bind_node(sk); sctp_sk(sk)->bind_hash = NULL; inet_sk(sk)->inet_num = 0; sctp_bucket_destroy(pp); spin_unlock(&head->lock); } void sctp_put_port(struct sock *sk) { local_bh_disable(); __sctp_put_port(sk); local_bh_enable(); } /* * The system picks an ephemeral port and chooses an address set equivalent * to binding with a wildcard address. * One of those addresses will be the primary address for the association. * This automatically enables the multihoming capability of SCTP. */ static int sctp_autobind(struct sock *sk) { union sctp_addr autoaddr; struct sctp_af *af; __be16 port; /* Initialize a local sockaddr structure to INADDR_ANY. */ af = sctp_sk(sk)->pf->af; port = htons(inet_sk(sk)->inet_num); af->inaddr_any(&autoaddr, port); return sctp_do_bind(sk, &autoaddr, af->sockaddr_len); } /* Parse out IPPROTO_SCTP CMSG headers. Perform only minimal validation. * * From RFC 2292 * 4.2 The cmsghdr Structure * * * When ancillary data is sent or received, any number of ancillary data * objects can be specified by the msg_control and msg_controllen members of * the msghdr structure, because each object is preceded by * a cmsghdr structure defining the object's length (the cmsg_len member). * Historically Berkeley-derived implementations have passed only one object * at a time, but this API allows multiple objects to be * passed in a single call to sendmsg() or recvmsg(). The following example * shows two ancillary data objects in a control buffer. * * |<--------------------------- msg_controllen -------------------------->| * | | * * |<----- ancillary data object ----->|<----- ancillary data object ----->| * * |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->| * | | | * * |<---------- cmsg_len ---------->| |<--------- cmsg_len ----------->| | * * |<--------- CMSG_LEN() --------->| |<-------- CMSG_LEN() ---------->| | * | | | | | * * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ * |cmsg_|cmsg_|cmsg_|XX| |XX|cmsg_|cmsg_|cmsg_|XX| |XX| * * |len |level|type |XX|cmsg_data[]|XX|len |level|type |XX|cmsg_data[]|XX| * * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ * ^ * | * * msg_control * points here */ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) { struct msghdr *my_msg = (struct msghdr *)msg; struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, my_msg) { if (!CMSG_OK(my_msg, cmsg)) return -EINVAL; /* Should we parse this header or ignore it? */ if (cmsg->cmsg_level != IPPROTO_SCTP) continue; /* Strictly check lengths following example in SCM code. */ switch (cmsg->cmsg_type) { case SCTP_INIT: /* SCTP Socket API Extension * 5.3.1 SCTP Initiation Structure (SCTP_INIT) * * This cmsghdr structure provides information for * initializing new SCTP associations with sendmsg(). * The SCTP_INITMSG socket option uses this same data * structure. This structure is not used for * recvmsg().
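* * For example (editor's sketch): a sender could request five outbound streams for the association being set up by attaching a struct sctp_initmsg with sinit_num_ostreams = 5 as an (IPPROTO_SCTP, SCTP_INIT) cmsg on the first sendmsg() call.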
* * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_initmsg))) return -EINVAL; cmsgs->init = CMSG_DATA(cmsg); break; case SCTP_SNDRCV: /* SCTP Socket API Extension * 5.3.2 SCTP Header Information Structure (SCTP_SNDRCV) * * This cmsghdr structure specifies SCTP options for * sendmsg() and describes SCTP header information * about a received message through recvmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ ---------------------- * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) return -EINVAL; cmsgs->srinfo = CMSG_DATA(cmsg); if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; case SCTP_SNDINFO: /* SCTP Socket API Extension * 5.3.4 SCTP Send Information Structure (SCTP_SNDINFO) * * This cmsghdr structure specifies SCTP options for * sendmsg(). This structure and SCTP_RCVINFO replace * SCTP_SNDRCV, which has been deprecated. * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_SNDINFO struct sctp_sndinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_sndinfo))) return -EINVAL; cmsgs->sinfo = CMSG_DATA(cmsg); if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; case SCTP_PRINFO: /* SCTP Socket API Extension * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) * * This cmsghdr structure specifies SCTP options for sendmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo))) return -EINVAL; cmsgs->prinfo = CMSG_DATA(cmsg); if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK) return -EINVAL; if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) cmsgs->prinfo->pr_value = 0; break; case SCTP_AUTHINFO: /* SCTP Socket API Extension * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) * * This cmsghdr structure specifies SCTP options for sendmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo */ if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo))) return -EINVAL; cmsgs->authinfo = CMSG_DATA(cmsg); break; case SCTP_DSTADDRV4: case SCTP_DSTADDRV6: /* SCTP Socket API Extension * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6) * * This cmsghdr structure specifies SCTP options for sendmsg(). * * cmsg_level cmsg_type cmsg_data[] * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_DSTADDRV4 struct in_addr * ------------ ------------ --------------------- * IPPROTO_SCTP SCTP_DSTADDRV6 struct in6_addr */ cmsgs->addrs_msg = my_msg; break; default: return -EINVAL; } } return 0; } /* * Wait for a packet. * Note: This function is the same function as in core/datagram.c * with a few modifications to make lksctp work. */ static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p) { int error; DEFINE_WAIT(wait); prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); /* Socket errors?
*/ error = sock_error(sk); if (error) goto out; if (!skb_queue_empty(&sk->sk_receive_queue)) goto ready; /* Socket shut down? */ if (sk->sk_shutdown & RCV_SHUTDOWN) goto out; /* Sequenced packets can come disconnected. If so we report the * problem. */ error = -ENOTCONN; /* Is there a good reason to think that we may receive some data? */ if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING)) goto out; /* Handle signals. */ if (signal_pending(current)) goto interrupted; /* Let another process have a go. Since we are going to sleep * anyway. Note: This may cause odd behaviors if the message * does not fit in the user's buffer, but this seems to be the * only way to honor MSG_DONTWAIT realistically. */ release_sock(sk); *timeo_p = schedule_timeout(*timeo_p); lock_sock(sk); ready: finish_wait(sk_sleep(sk), &wait); return 0; interrupted: error = sock_intr_errno(*timeo_p); out: finish_wait(sk_sleep(sk), &wait); *err = error; return error; } /* Receive a datagram. * Note: This is pretty much the same routine as in core/datagram.c * with a few changes to make lksctp work. */ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, int *err) { int error; struct sk_buff *skb; long timeo; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); pr_debug("%s: timeo:%ld, max:%ld\n", __func__, timeo, MAX_SCHEDULE_TIMEOUT); do { /* Again only user level code calls this function, * so nothing interrupt level * will suddenly eat the receive_queue. * * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ if (flags & MSG_PEEK) { skb = skb_peek(&sk->sk_receive_queue); if (skb) refcount_inc(&skb->users); } else { skb = __skb_dequeue(&sk->sk_receive_queue); } if (skb) return skb; /* Caller is allowed not to check sk->sk_err before calling. */ error = sock_error(sk); if (error) goto no_packet; if (sk->sk_shutdown & RCV_SHUTDOWN) break; /* User doesn't want to wait. */ error = -EAGAIN; if (!timeo) goto no_packet; } while (sctp_wait_for_packet(sk, err, &timeo) == 0); return NULL; no_packet: *err = error; return NULL; } /* If sndbuf has changed, wake up per association sndbuf waiters. */ static void __sctp_write_space(struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; if (sctp_wspace(asoc) <= 0) return; if (waitqueue_active(&asoc->wait)) wake_up_interruptible(&asoc->wait); if (sctp_writeable(sk)) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (wq) { if (waitqueue_active(&wq->wait)) wake_up_interruptible(&wq->wait); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. * We have not tested with it yet. */ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } } static void sctp_wake_up_waiters(struct sock *sk, struct sctp_association *asoc) { struct sctp_association *tmp = asoc; /* We do accounting for the sndbuf space per association, * so we only need to wake our own association. */ if (asoc->ep->sndbuf_policy) return __sctp_write_space(asoc); /* If association goes down and is just flushing its * outq, then just normally notify others. */ if (asoc->base.dead) return sctp_write_space(sk); /* Accounting for the sndbuf space is per socket, so we * need to wake up others, try to be fair and in case of * other associations, let them have a go first instead * of just doing a sctp_write_space() call. 
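* * Concretely (editor's note): with sndbuf_policy == 0 and associations A, B and C sharing one socket's send buffer, a chunk freed on B makes the loop below start after B, so C's and A's waiters are woken first and B's own waiters last.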
* * Note that we reach sctp_wake_up_waiters() only when * associations free up queued chunks, thus we are under * lock and the list of associations on a socket is * guaranteed not to change. */ for (tmp = list_next_entry(tmp, asocs); 1; tmp = list_next_entry(tmp, asocs)) { /* Manually skip the head element. */ if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs)) continue; /* Wake up association. */ __sctp_write_space(tmp); /* We've reached the end. */ if (tmp == asoc) break; } } /* Do accounting for the sndbuf space. * Decrement the used sndbuf space of the corresponding association by the * data size which was just transmitted(freed). */ static void sctp_wfree(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; struct sctp_association *asoc = chunk->asoc; struct sock *sk = asoc->base.sk; sk_mem_uncharge(sk, skb->truesize); sk_wmem_queued_add(sk, -(skb->truesize + sizeof(struct sctp_chunk))); asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk); WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc)); if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } sock_wfree(skb); sctp_wake_up_waiters(sk, asoc); sctp_association_put(asoc); } /* Do accounting for the receive space on the socket. * Accounting for the association is done in ulpevent.c * We set this as a destructor for the cloned data skbs so that * accounting is done at the correct time. */ void sctp_sock_rfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct sctp_ulpevent *event = sctp_skb2event(skb); atomic_sub(event->rmem_len, &sk->sk_rmem_alloc); /* * Mimic the behavior of sock_rfree */ sk_mem_uncharge(sk, event->rmem_len); } /* Helper function to wait for space in the sndbuf. */ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, size_t msg_len) { struct sock *sk = asoc->base.sk; long current_timeo = *timeo_p; DEFINE_WAIT(wait); int err = 0; pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); /* Increment the association's refcnt. */ sctp_association_hold(asoc); /* Wait on the association specific sndbuf space. */ for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); if (asoc->base.dead) goto do_dead; if (!*timeo_p) goto do_nonblock; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; if (signal_pending(current)) goto do_interrupted; if ((int)msg_len <= sctp_wspace(asoc) && sk_wmem_schedule(sk, msg_len)) break; /* Let another process have a go. Since we are going * to sleep anyway. */ release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); if (sk != asoc->base.sk) goto do_error; *timeo_p = current_timeo; } out: finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. 
*/ sctp_association_put(asoc); return err; do_dead: err = -ESRCH; goto out; do_error: err = -EPIPE; goto out; do_interrupted: err = sock_intr_errno(*timeo_p); goto out; do_nonblock: err = -EAGAIN; goto out; } void sctp_data_ready(struct sock *sk) { struct socket_wq *wq; trace_sk_data_ready(sk); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } /* If socket sndbuf has changed, wake up all per association waiters. */ void sctp_write_space(struct sock *sk) { struct sctp_association *asoc; /* Wake up the tasks in each wait queue. */ list_for_each_entry(asoc, &((sctp_sk(sk))->ep->asocs), asocs) { __sctp_write_space(asoc); } } /* Is there any sndbuf space available on the socket? * * Note that sk_wmem_alloc is the sum of the send buffers on all of the * associations on the same socket. For a UDP-style socket with * multiple associations, it is possible for it to be "unwriteable" * prematurely. I assume that this is acceptable because * a premature "unwriteable" is better than an accidental "writeable" which * would cause an unwanted block under certain circumstances. For the 1-1 * UDP-style sockets or TCP-style sockets, this code should work. * - Daisy */ static bool sctp_writeable(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) > READ_ONCE(sk->sk_wmem_queued); } /* Wait for an association to go into ESTABLISHED state. If timeout is 0, * returns immediately with EINPROGRESS. */ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) { struct sock *sk = asoc->base.sk; int err = 0; long current_timeo = *timeo_p; DEFINE_WAIT(wait); pr_debug("%s: asoc:%p, timeo:%ld\n", __func__, asoc, *timeo_p); /* Increment the association's refcnt. */ sctp_association_hold(asoc); for (;;) { prepare_to_wait_exclusive(&asoc->wait, &wait, TASK_INTERRUPTIBLE); if (!*timeo_p) goto do_nonblock; if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || asoc->base.dead) goto do_error; if (signal_pending(current)) goto do_interrupted; if (sctp_state(asoc, ESTABLISHED)) break; /* Let another process have a go. Since we are going * to sleep anyway. */ release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); *timeo_p = current_timeo; } out: finish_wait(&asoc->wait, &wait); /* Release the association's refcnt. 
*/ sctp_association_put(asoc); return err; do_error: if (asoc->init_err_counter + 1 > asoc->max_init_attempts) err = -ETIMEDOUT; else err = -ECONNREFUSED; goto out; do_interrupted: err = sock_intr_errno(*timeo_p); goto out; do_nonblock: err = -EINPROGRESS; goto out; } static int sctp_wait_for_accept(struct sock *sk, long timeo) { struct sctp_endpoint *ep; int err = 0; DEFINE_WAIT(wait); ep = sctp_sk(sk)->ep; for (;;) { prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&ep->asocs)) { release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); } err = -EINVAL; if (!sctp_sstate(sk, LISTENING)) break; err = 0; if (!list_empty(&ep->asocs)) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = -EAGAIN; if (!timeo) break; } finish_wait(sk_sleep(sk), &wait); return err; } static void sctp_wait_for_close(struct sock *sk, long timeout) { DEFINE_WAIT(wait); do { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (list_empty(&sctp_sk(sk)->ep->asocs)) break; release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } while (!signal_pending(current) && timeout); finish_wait(sk_sleep(sk), &wait); } static void sctp_skb_set_owner_r_frag(struct sk_buff *skb, struct sock *sk) { struct sk_buff *frag; if (!skb->data_len) goto done; /* Don't forget the fragments. */ skb_walk_frags(skb, frag) sctp_skb_set_owner_r_frag(frag, sk); done: sctp_skb_set_owner_r(skb, sk); } void sctp_copy_sock(struct sock *newsk, struct sock *sk, struct sctp_association *asoc) { struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; struct sctp_sock *sp = sctp_sk(sk); newsk->sk_type = sk->sk_type; newsk->sk_bound_dev_if = sk->sk_bound_dev_if; newsk->sk_flags = sk->sk_flags; newsk->sk_tsflags = sk->sk_tsflags; newsk->sk_no_check_tx = sk->sk_no_check_tx; newsk->sk_no_check_rx = sk->sk_no_check_rx; newsk->sk_reuse = sk->sk_reuse; sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; newsk->sk_destruct = sk->sk_destruct; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; newsk->sk_sndbuf = sk->sk_sndbuf; newsk->sk_rcvbuf = sk->sk_rcvbuf; newsk->sk_lingertime = sk->sk_lingertime; newsk->sk_rcvtimeo = sk->sk_rcvtimeo; newsk->sk_sndtimeo = sk->sk_sndtimeo; newsk->sk_rxhash = sk->sk_rxhash; newinet = inet_sk(newsk); /* Initialize sk's sport, dport, rcv_saddr and daddr for * getsockname() and getpeername() */ newinet->inet_sport = inet->inet_sport; newinet->inet_saddr = inet->inet_saddr; newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; atomic_set(&newinet->inet_id, get_random_u16()); newinet->uc_ttl = inet->uc_ttl; inet_set_bit(MC_LOOP, newsk); newinet->mc_ttl = 1; newinet->mc_index = 0; newinet->mc_list = NULL; if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); /* Set newsk security attributes from original sk and connection * security attribute from asoc. */ security_sctp_sk_clone(asoc, sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, const struct sock *sk_from) { size_t ancestor_size = sizeof(struct inet_sock); ancestor_size += sk_from->sk_prot->obj_size; ancestor_size -= offsetof(struct sctp_sock, pd_lobby); __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); } /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. 
*/ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sctp_association *assoc, enum sctp_socket_type type) { struct sctp_sock *oldsp = sctp_sk(oldsk); struct sctp_sock *newsp = sctp_sk(newsk); struct sctp_bind_bucket *pp; /* hash list port iterator */ struct sctp_endpoint *newep = newsp->ep; struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; struct sctp_bind_hashbucket *head; int err; /* Migrate socket buffer sizes and all the socket level options to the * new socket. */ newsk->sk_sndbuf = oldsk->sk_sndbuf; newsk->sk_rcvbuf = oldsk->sk_rcvbuf; /* Brute force copy old sctp opt. */ sctp_copy_descendant(newsk, oldsk); /* Restore the ep value that was overwritten with the above structure * copy. */ newsp->ep = newep; newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), inet_sk(oldsk)->inet_num)]; spin_lock_bh(&head->lock); pp = sctp_sk(oldsk)->bind_hash; sk_add_bind_node(newsk, &pp->owner); sctp_sk(newsk)->bind_hash = pp; inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num; spin_unlock_bh(&head->lock); /* Copy the bind_addr list from the original endpoint to the new * endpoint so that we can handle restarts properly. */ err = sctp_bind_addr_dup(&newsp->ep->base.bind_addr, &oldsp->ep->base.bind_addr, GFP_KERNEL); if (err) return err; /* New ep's auth_hmacs should be set if old ep's is set, in case * that net->sctp.auth_enable has been changed to 0 by users and * new ep's auth_hmacs couldn't be set in sctp_endpoint_init(). */ if (oldsp->ep->auth_hmacs) { err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL); if (err) return err; } sctp_auto_asconf_init(newsp); /* Move any messages in the old socket's receive queue that are for the * peeled off association to the new socket's receive queue. */ sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { __skb_unlink(skb, &oldsk->sk_receive_queue); __skb_queue_tail(&newsk->sk_receive_queue, skb); sctp_skb_set_owner_r_frag(skb, newsk); } } /* Clean up any messages pending delivery due to partial * delivery. Three cases: * 1) No partial delivery; no work. * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby. * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue. */ atomic_set(&sctp_sk(newsk)->pd_mode, assoc->ulpq.pd_mode); if (atomic_read(&sctp_sk(oldsk)->pd_mode)) { struct sk_buff_head *queue; /* Decide which queue to move pd_lobby skbs to. */ if (assoc->ulpq.pd_mode) { queue = &newsp->pd_lobby; } else queue = &newsk->sk_receive_queue; /* Walk through the pd_lobby, looking for skbs that * need to be moved to the new socket. */ sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == assoc) { __skb_unlink(skb, &oldsp->pd_lobby); __skb_queue_tail(queue, skb); sctp_skb_set_owner_r_frag(skb, newsk); } } /* Clear up any skbs waiting for the partial * delivery to finish. */ if (assoc->ulpq.pd_mode) sctp_clear_pd(oldsk, NULL); } sctp_for_each_rx_skb(assoc, newsk, sctp_skb_set_owner_r_frag); /* Set the type of socket to indicate that it is peeled off from the * original UDP-style socket or created with the accept() call on a * TCP-style socket. */ newsp->type = type; /* Mark the new socket "in-use" by the user so that any packets * that may arrive on the association after we've moved it are * queued to the backlog. This prevents a potential race between * backlog processing on the old socket and new-packet processing * on the new socket.
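* * (Editor's note: the lock_sock_nested(newsk, SINGLE_DEPTH_NESTING) below is what lets lockdep accept taking newsk's lock while the caller still holds oldsk's; the guarantee in the next paragraph is why that nesting cannot deadlock.)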
* * The caller has just allocated newsk so we can guarantee that other * paths won't try to lock it and then oldsk. */ lock_sock_nested(newsk, SINGLE_DEPTH_NESTING); sctp_for_each_tx_datachunk(assoc, true, sctp_clear_owner_w); sctp_assoc_migrate(assoc, newsk); sctp_for_each_tx_datachunk(assoc, false, sctp_set_owner_w); /* If the association on the newsk is already closed before accept() * is called, set RCV_SHUTDOWN flag. */ if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) { inet_sk_set_state(newsk, SCTP_SS_CLOSED); newsk->sk_shutdown |= RCV_SHUTDOWN; } else { inet_sk_set_state(newsk, SCTP_SS_ESTABLISHED); } release_sock(newsk); return 0; } /* This proto struct describes the ULP interface for SCTP. */ struct proto sctp_prot = { .name = "SCTP", .owner = THIS_MODULE, .close = sctp_close, .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_init_sock, .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, .bpf_bypass_getsockopt = sctp_bpf_bypass_getsockopt, .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp_sock), .useroffset = offsetof(struct sctp_sock, subscribe), .usersize = offsetof(struct sctp_sock, initmsg) - offsetof(struct sctp_sock, subscribe) + sizeof_field(struct sctp_sock, initmsg), .sysctl_mem = sysctl_sctp_mem, .sysctl_rmem = sysctl_sctp_rmem, .sysctl_wmem = sysctl_sctp_wmem, .memory_pressure = &sctp_memory_pressure, .enter_memory_pressure = sctp_enter_memory_pressure, .memory_allocated = &sctp_memory_allocated, .per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc, .sockets_allocated = &sctp_sockets_allocated, }; #if IS_ENABLED(CONFIG_IPV6) static void sctp_v6_destruct_sock(struct sock *sk) { sctp_destruct_common(sk); inet6_sock_destruct(sk); } static int sctp_v6_init_sock(struct sock *sk) { int ret = sctp_init_sock(sk); if (!ret) sk->sk_destruct = sctp_v6_destruct_sock; return ret; } struct proto sctpv6_prot = { .name = "SCTPv6", .owner = THIS_MODULE, .close = sctp_close, .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, .init = sctp_v6_init_sock, .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, .bpf_bypass_getsockopt = sctp_bpf_bypass_getsockopt, .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), .ipv6_pinfo_offset = offsetof(struct sctp6_sock, inet6), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - offsetof(struct sctp6_sock, sctp.subscribe) + sizeof_field(struct sctp6_sock, sctp.initmsg), .sysctl_mem = sysctl_sctp_mem, .sysctl_rmem = sysctl_sctp_rmem, .sysctl_wmem = sysctl_sctp_wmem, .memory_pressure = &sctp_memory_pressure, .enter_memory_pressure = sctp_enter_memory_pressure, .memory_allocated = &sctp_memory_allocated, .per_cpu_fw_alloc = &sctp_memory_per_cpu_fw_alloc, .sockets_allocated = &sctp_sockets_allocated, }; #endif /* IS_ENABLED(CONFIG_IPV6) */
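/*
 * Editor's usage sketch (not part of the kernel source): the getsockopt
 * handlers above are reached from userspace through plain getsockopt() on an
 * SCTP socket. A minimal example that reads the number of associations on a
 * one-to-many socket, assuming <netinet/sctp.h> from lksctp-tools provides
 * the SOL_SCTP and SCTP_GET_ASSOC_NUMBER definitions:
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/sctp.h>
 *
 *	int main(void)
 *	{
 *		int fd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
 *		uint32_t n = 0;
 *		socklen_t len = sizeof(n);
 *
 *		if (fd >= 0 &&
 *		    getsockopt(fd, SOL_SCTP, SCTP_GET_ASSOC_NUMBER, &n, &len) == 0)
 *			printf("%u associations\n", n);
 *		return 0;
 *	}
 *
 * This lands in sctp_getsockopt_assoc_number() above; the same call on a
 * TCP-style (SOCK_STREAM) SCTP socket fails with EOPNOTSUPP.
 */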
/* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA =
1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When an rdma netdevice is used in bonding, the bonding master * netdevice should be considered for default GIDs. Therefore, ignore * slave rdma netdevices when bonding is considered. * Additionally, when the event (cookie) netdevice is the bond master * device, make sure that it is the upper netdevice of the rdma * netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is the bond master device of the netdevice of the RDMA device's port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if the cookie netdevice * is a bond master device and rdma_ndev is its lower netdevice. It might * not have been established as a slave device yet.
*/ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void 
add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs do not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of the bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices.
* * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *event_ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); } static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); if (master_ndev) dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. 
*/ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. 
*/ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, 
&work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We rely on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to freed memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); } |
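/*
 * Illustrative sketch added by the editor, not part of roce_gid_mgmt.c:
 * how a RoCE driver might use rdma_roce_rescan_device(), exported above, to
 * repopulate its GID tables once its device is registered. The demo_dev
 * structure and the event hook are hypothetical.
 */
struct demo_dev {
	struct ib_device ib_dev;
	/* ... driver-private state ... */
};

static void demo_device_registered(struct demo_dev *dev)
{
	/* Walks every netdev in every netns and re-adds default and
	 * IP-based GIDs, as enum_all_gids_of_dev_cb() does above. */
	rdma_roce_rescan_device(&dev->ib_dev);
}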
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H #include <linux/atomic.h> #include <linux/types.h> #include <linux/refcount.h> #include <linux/completion.h> #define SHRINKER_UNIT_BITS BITS_PER_LONG /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to the memcg. */ struct shrinker_info_unit { atomic_long_t nr_deferred[SHRINKER_UNIT_BITS]; DECLARE_BITMAP(map, SHRINKER_UNIT_BITS); }; struct shrinker_info { struct rcu_head rcu; int map_nr_max; struct shrinker_info_unit *unit[]; }; /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. * * The 'gfp_mask' refers to the allocation we are currently trying to * fulfil. */ struct shrink_control { gfp_t gfp_mask; /* current node being shrunk (for NUMA aware shrinkers) */ int nid; /* * How many objects scan_objects should scan and try to reclaim. * This is reset before every call, so it is safe for callees * to modify. */ unsigned long nr_to_scan; /* * How many objects did scan_objects process? * This defaults to nr_to_scan before every call, but the callee * should track its actual progress. */ unsigned long nr_scanned; /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; }; #define SHRINK_STOP (~0UL) #define SHRINK_EMPTY (~0UL - 1) /* * A callback you can register to apply pressure to ageable caches. * * @count_objects should return the number of freeable items in the cache. If * there are no objects to free, it should return SHRINK_EMPTY, while 0 is * returned in cases where the number of freeable items cannot be determined * or the shrinker should skip this cache for now (e.g., when their number * is below the shrinkable limit). No deadlock checks should be done during the * count callback - the shrinker relies on aggregating scan counts that couldn't * be executed due to potential deadlocks to be run at a later call when the * deadlock condition is no longer pending. * * @scan_objects will only be called if @count_objects returned a non-zero * value for the number of freeable objects. The callout should scan the cache * and attempt to free items from the cache. It should then return the number * of objects freed during the scan, or SHRINK_STOP if progress cannot be made * due to potential deadlocks. If SHRINK_STOP is returned, then no further * attempts to call the @scan_objects will be made from the current reclaim * context. * * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ unsigned flags; /* * The reference count of this shrinker.
A registered shrinker has an * initial refcount of 1; lookup operations are then allowed * to use it via shrinker_try_get(). Later, in the unregistration step, * the initial refcount is discarded, and the shrinker is freed * asynchronously via RCU after its refcount reaches 0. */ refcount_t refcount; struct completion done; /* used to wait for refcount to reach 0 */ struct rcu_head rcu; void *private_data; /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Internal flags */ #define SHRINKER_REGISTERED BIT(0) #define SHRINKER_ALLOCATED BIT(1) /* Flags for users to use */ #define SHRINKER_NUMA_AWARE BIT(2) #define SHRINKER_MEMCG_AWARE BIT(3) /* * For now this only makes sense when the shrinker is also MEMCG_AWARE; * a non-MEMCG_AWARE shrinker should not have this flag set. */ #define SHRINKER_NONSLAB BIT(4) __printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); static inline bool shrinker_try_get(struct shrinker *shrinker) { return refcount_inc_not_zero(&shrinker->refcount); } static inline void shrinker_put(struct shrinker *shrinker) { if (refcount_dec_and_test(&shrinker->refcount)) complete(&shrinker->done); } #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { return 0; } #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* _LINUX_SHRINKER_H */ |
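/*
 * Illustrative sketch added by the editor, not part of this header: the
 * allocate-then-register flow the API above expects. The demo_* names are
 * hypothetical; count_objects() reports freeable objects (or SHRINK_EMPTY
 * when there is nothing to free) and scan_objects() returns the number
 * actually freed.
 */
static unsigned long demo_cached_objects;		/* hypothetical cache size */
static unsigned long demo_cache_trim(unsigned long nr);	/* hypothetical helper */

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc)
{
	unsigned long nr = READ_ONCE(demo_cached_objects);

	return nr ? nr : SHRINK_EMPTY;
}

static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc)
{
	/* Free up to sc->nr_to_scan objects and report how many went. */
	return demo_cache_trim(sc->nr_to_scan);
}

static int demo_shrinker_init(void)
{
	struct shrinker *s = shrinker_alloc(0, "demo-cache");

	if (!s)
		return -ENOMEM;
	s->count_objects = demo_count;
	s->scan_objects = demo_scan;
	shrinker_register(s);	/* visible to reclaim from here on */
	return 0;
}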
// SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, Intel Corporation.
*/ #include "protocol.h" #include "mib.h" void mptcp_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); if (!mptcp_pm_is_userspace(msk)) return; spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sock_kfree_s(sk, entry, sizeof(*entry)); } } static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; bool id_match = false; int ret = -EINVAL; bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_lock_bh(&msk->pm.lock); list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); if (addr_match && entry->addr.id == 0) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); if (addr_match && id_match) { match = e; break; } else if (addr_match || id_match) { break; } __set_bit(e->addr.id, id_bitmap); } if (!match && !addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. */ e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } *e = *entry; if (!e->addr.id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 1); list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; } else if (match) { ret = entry->addr.id; } append_err: spin_unlock_bh(&msk->pm.lock); return ret; } /* If the subflow is closed from the other peer (not via a * subflow destroy command then), we want to keep the entry * not to assign the same ID to another address and to be * able to send RM_ADDR after the removal of the subflow. */ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { struct mptcp_pm_addr_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { /* TODO: a refcount is needed because the entry can * be used multiple times (e.g. fullmesh mode). 
*/ list_del_rcu(&entry->list); kfree(entry); msk->pm.local_addr_used--; return 0; } } return -EINVAL; } int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry, *match = NULL; spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (id == entry->addr.id) { match = entry; break; } } spin_unlock_bh(&msk->pm.lock); if (match) { *flags = match->flags; *ifindex = match->ifindex; } return 0; } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); new_entry.addr = *skc; new_entry.addr.id = 0; new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; if (new_entry.addr.port == msk_sport) new_entry.addr.port = 0; return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); } int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; u32 token_val; if (!addr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto announce_err; } err = mptcp_pm_parse_entry(addr, info, true, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "error parsing local address"); goto announce_err; } if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { GENL_SET_ERR_MSG(info, "invalid addr id or flags"); err = -EINVAL; goto announce_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto announce_err; } lock_sock(sk); spin_lock_bh(&msk->pm.lock); if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); mptcp_pm_nl_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); err = 0; announce_err: sock_put(sk); return err; } static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, struct genl_info *info) { struct mptcp_rm_list list = { .nr = 0 }; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; bool has_id_0 = false; int err = -EINVAL; lock_sock(sk); mptcp_for_each_subflow(msk, subflow) { if (subflow->local_id == 0) { has_id_0 = true; break; } } if (!has_id_0) { GENL_SET_ERR_MSG(info, "address with id 0 not found"); goto remove_err; } list.ids[list.nr++] = 0; spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); err = 0; remove_err: release_sock(sk); return err; } int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match = NULL; struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; LIST_HEAD(free_list); int err = -EINVAL; struct sock *sk; u32 token_val; u8 id_val; if (!id || 
!token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } id_val = nla_get_u8(id); token_val = nla_get_u32(token); msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto out; } if (id_val == 0) { err = mptcp_userspace_pm_remove_id_zero_address(msk, info); goto out; } lock_sock(sk); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (entry->addr.id == id_val) { match = entry; break; } } if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); release_sock(sk); goto out; } list_move(&match->list, &free_list); mptcp_pm_remove_addrs(msk, &free_list); release_sock(sk); list_for_each_entry_safe(match, entry, &free_list, list) { sock_kfree_s(sk, match, sizeof(*match)); } err = 0; out: sock_put(sk); return err; } int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry local = { 0 }; struct mptcp_addr_info addr_r; struct mptcp_addr_info addr_l; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; u32 token_val; if (!laddr || !raddr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(genl_info_net(info), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto create_err; } err = mptcp_pm_parse_addr(laddr, info, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto create_err; } err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto create_err; } if (!mptcp_pm_addr_families_match(sk, &addr_l, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); err = -EINVAL; goto create_err; } local.addr = addr_l; err = mptcp_userspace_pm_append_new_local_addr(msk, &local); if (err < 0) { GENL_SET_ERR_MSG(info, "did not match address and id"); goto create_err; } lock_sock(sk); err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); release_sock(sk); spin_lock_bh(&msk->pm.lock); if (err) mptcp_userspace_pm_delete_local_addr(msk, &local); else msk->pm.subflows++; spin_unlock_bh(&msk->pm.lock); create_err: sock_put(sk); return err; } static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, const struct mptcp_addr_info *local, const struct mptcp_addr_info *remote) { struct mptcp_subflow_context *subflow; if (local->family != remote->family) return NULL; mptcp_for_each_subflow(msk, subflow) { const struct inet_sock *issk; struct sock *ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (local->family != ssk->sk_family) continue; issk = inet_sk(ssk); switch (ssk->sk_family) { case AF_INET: if (issk->inet_saddr != local->addr.s_addr || issk->inet_daddr != remote->addr.s_addr) continue; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *pinfo = inet6_sk(ssk); if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; } #endif 
default: continue; } if (issk->inet_sport == local->port && issk->inet_dport == remote->port) return ssk; } return NULL; } int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_addr_info addr_l; struct mptcp_addr_info addr_r; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; u32 token_val; if (!laddr || !raddr || !token) { GENL_SET_ERR_MSG(info, "missing required inputs"); return err; } token_val = nla_get_u32(token); msk = mptcp_token_get_sock(genl_info_net(info), token_val); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); return err; } sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto destroy_err; } err = mptcp_pm_parse_addr(laddr, info, &addr_l); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); goto destroy_err; } err = mptcp_pm_parse_addr(raddr, info, &addr_r); if (err < 0) { NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); goto destroy_err; } if (addr_l.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; goto destroy_err; } if (!addr_l.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); err = -EINVAL; goto destroy_err; } lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); if (ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_pm_addr_entry entry = { .addr = addr_l }; spin_lock_bh(&msk->pm.lock); mptcp_userspace_pm_delete_local_addr(msk, &entry); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); mptcp_close_ssk(sk, ssk, subflow); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); err = 0; } else { err = -ESRCH; } release_sock(sk); destroy_err: sock_put(sk); return err; } int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup) { struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; u32 token_val; token_val = nla_get_u32(token); msk = mptcp_token_get_sock(net, token_val); if (!msk) return ret; sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) goto set_flags_err; if (loc->addr.family == AF_UNSPEC || rem->addr.family == AF_UNSPEC) goto set_flags_err; lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); release_sock(sk); set_flags_err: sock_put(sk); return ret; } |
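/*
 * Illustrative sketch added by the editor, not part of pm_userspace.c: the
 * address-id allocation in mptcp_userspace_pm_append_new_local_addr() above
 * reduces to a bitmap scan that never hands out id 0 automatically. A hedged,
 * self-contained restatement of just that step (demo_pick_addr_id() is
 * hypothetical):
 */
#include <linux/bitmap.h>

static u8 demo_pick_addr_id(void)
{
	DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);

	bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
	/* ...__set_bit(e->addr.id, id_bitmap) for every entry on the
	 * userspace_pm_local_addr_list, as the loop above does... */

	/* Search [1, MPTCP_PM_MAX_ADDR_ID] for the first free id. */
	return find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 1);
}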
// SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Cryptographic Hash operations.
* * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" static inline struct crypto_istat_hash *shash_get_stat(struct shash_alg *alg) { return hash_get_stat(&alg->halg); } static inline int crypto_shash_errstat(struct shash_alg *alg, int err) { if (IS_ENABLED(CONFIG_CRYPTO_STATS) && err) atomic64_inc(&shash_get_stat(alg)->err_cnt); return err; } int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } EXPORT_SYMBOL_GPL(shash_no_setkey); static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg) { if (crypto_shash_alg_needs_key(alg)) crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); int err; err = shash->setkey(tfm, key, keylen); if (unlikely(err)) { shash_set_needkey(tfm, shash); return err; } crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_setkey); int crypto_shash_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_add(len, &shash_get_stat(shash)->hash_tlen); err = shash->update(desc, data, len); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_update); int crypto_shash_final(struct shash_desc *desc, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) atomic64_inc(&shash_get_stat(shash)->hash_cnt); err = shash->final(desc, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_final); static int shash_default_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); return shash->update(desc, data, len) ?: shash->final(desc, out); } int crypto_shash_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = shash_get_stat(shash); atomic64_inc(&istat->hash_cnt); atomic64_add(len, &istat->hash_tlen); } err = shash->finup(desc, data, len, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_finup); static int shash_default_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); return shash->init(desc) ?: shash->finup(desc, data, len, out); } int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; struct shash_alg *shash = crypto_shash_alg(tfm); int err; if (IS_ENABLED(CONFIG_CRYPTO_STATS)) { struct crypto_istat_hash *istat = shash_get_stat(shash); atomic64_inc(&istat->hash_cnt); atomic64_add(len, &istat->hash_tlen); } if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) err = -ENOKEY; else err = shash->digest(desc, data, len, out); return crypto_shash_errstat(shash, err); } EXPORT_SYMBOL_GPL(crypto_shash_digest); int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out) { SHASH_DESC_ON_STACK(desc, 
						       tfm);
	int err;

	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, len, out);

	shash_desc_zero(desc);

	return err;
}
EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest);

int crypto_shash_export(struct shash_desc *desc, void *out)
{
	struct crypto_shash *tfm = desc->tfm;
	struct shash_alg *shash = crypto_shash_alg(tfm);

	if (shash->export)
		return shash->export(desc, out);

	memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(tfm));
	return 0;
}
EXPORT_SYMBOL_GPL(crypto_shash_export);

int crypto_shash_import(struct shash_desc *desc, const void *in)
{
	struct crypto_shash *tfm = desc->tfm;
	struct shash_alg *shash = crypto_shash_alg(tfm);

	if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
		return -ENOKEY;

	if (shash->import)
		return shash->import(desc, in);

	memcpy(shash_desc_ctx(desc), in, crypto_shash_descsize(tfm));
	return 0;
}
EXPORT_SYMBOL_GPL(crypto_shash_import);

static void crypto_shash_exit_tfm(struct crypto_tfm *tfm)
{
	struct crypto_shash *hash = __crypto_shash_cast(tfm);
	struct shash_alg *alg = crypto_shash_alg(hash);

	alg->exit_tfm(hash);
}

static int crypto_shash_init_tfm(struct crypto_tfm *tfm)
{
	struct crypto_shash *hash = __crypto_shash_cast(tfm);
	struct shash_alg *alg = crypto_shash_alg(hash);
	int err;

	hash->descsize = alg->descsize;

	shash_set_needkey(hash, alg);

	if (alg->exit_tfm)
		tfm->exit = crypto_shash_exit_tfm;

	if (!alg->init_tfm)
		return 0;

	err = alg->init_tfm(hash);
	if (err)
		return err;

	/* ->init_tfm() may have increased the descsize. */
	if (WARN_ON_ONCE(hash->descsize > HASH_MAX_DESCSIZE)) {
		if (alg->exit_tfm)
			alg->exit_tfm(hash);
		return -EINVAL;
	}

	return 0;
}

static void crypto_shash_free_instance(struct crypto_instance *inst)
{
	struct shash_instance *shash = shash_instance(inst);

	shash->free(shash);
}

static int __maybe_unused crypto_shash_report(
	struct sk_buff *skb, struct crypto_alg *alg)
{
	struct crypto_report_hash rhash;
	struct shash_alg *salg = __crypto_shash_alg(alg);

	memset(&rhash, 0, sizeof(rhash));

	strscpy(rhash.type, "shash", sizeof(rhash.type));

	rhash.blocksize = alg->cra_blocksize;
	rhash.digestsize = salg->digestsize;

	return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash);
}

static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
	__maybe_unused;
static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
{
	struct shash_alg *salg = __crypto_shash_alg(alg);

	seq_printf(m, "type         : shash\n");
	seq_printf(m, "blocksize    : %u\n", alg->cra_blocksize);
	seq_printf(m, "digestsize   : %u\n", salg->digestsize);
}

static int __maybe_unused crypto_shash_report_stat(
	struct sk_buff *skb, struct crypto_alg *alg)
{
	return crypto_hash_report_stat(skb, alg, "shash");
}

const struct crypto_type crypto_shash_type = {
	.extsize = crypto_alg_extsize,
	.init_tfm = crypto_shash_init_tfm,
	.free = crypto_shash_free_instance,
#ifdef CONFIG_PROC_FS
	.show = crypto_shash_show,
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
	.report = crypto_shash_report,
#endif
#ifdef CONFIG_CRYPTO_STATS
	.report_stat = crypto_shash_report_stat,
#endif
	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
	.maskset = CRYPTO_ALG_TYPE_MASK,
	.type = CRYPTO_ALG_TYPE_SHASH,
	.tfmsize = offsetof(struct crypto_shash, base),
};

int crypto_grab_shash(struct crypto_shash_spawn *spawn,
		      struct crypto_instance *inst,
		      const char *name, u32 type, u32 mask)
{
	spawn->base.frontend = &crypto_shash_type;
	return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_grab_shash);

struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type,
					u32 mask)
{
	return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_alloc_shash);

int crypto_has_shash(const char *alg_name, u32 type, u32 mask)
{
	return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask);
}
EXPORT_SYMBOL_GPL(crypto_has_shash);

struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash)
{
	struct crypto_tfm *tfm = crypto_shash_tfm(hash);
	struct shash_alg *alg = crypto_shash_alg(hash);
	struct crypto_shash *nhash;
	int err;

	if (!crypto_shash_alg_has_setkey(alg)) {
		tfm = crypto_tfm_get(tfm);
		if (IS_ERR(tfm))
			return ERR_CAST(tfm);

		return hash;
	}

	if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init))
		return ERR_PTR(-ENOSYS);

	nhash = crypto_clone_tfm(&crypto_shash_type, tfm);
	if (IS_ERR(nhash))
		return nhash;

	nhash->descsize = hash->descsize;

	if (alg->clone_tfm) {
		err = alg->clone_tfm(nhash, hash);
		if (err) {
			crypto_free_shash(nhash);
			return ERR_PTR(err);
		}
	}

	return nhash;
}
EXPORT_SYMBOL_GPL(crypto_clone_shash);

int hash_prepare_alg(struct hash_alg_common *alg)
{
	struct crypto_istat_hash *istat = hash_get_stat(alg);
	struct crypto_alg *base = &alg->base;

	if (alg->digestsize > HASH_MAX_DIGESTSIZE)
		return -EINVAL;

	/* alignmask is not useful for hashes, so it is not supported. */
	if (base->cra_alignmask)
		return -EINVAL;

	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;

	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
		memset(istat, 0, sizeof(*istat));

	return 0;
}

static int shash_prepare_alg(struct shash_alg *alg)
{
	struct crypto_alg *base = &alg->halg.base;
	int err;

	if (alg->descsize > HASH_MAX_DESCSIZE)
		return -EINVAL;

	if ((alg->export && !alg->import) || (alg->import && !alg->export))
		return -EINVAL;

	err = hash_prepare_alg(&alg->halg);
	if (err)
		return err;

	base->cra_type = &crypto_shash_type;
	base->cra_flags |= CRYPTO_ALG_TYPE_SHASH;

	/*
	 * Handle missing optional functions.  For each one we can either
	 * install a default here, or we can leave the pointer as NULL and check
	 * the pointer for NULL in crypto_shash_*(), avoiding an indirect call
	 * when the default behavior is desired.  For ->finup and ->digest we
	 * install defaults, since for optimal performance algorithms should
	 * implement these anyway.  On the other hand, for ->import and
	 * ->export the common case and best performance comes from the simple
	 * memcpy of the shash_desc_ctx, so when those pointers are NULL we
	 * leave them NULL and provide the memcpy with no indirect call.
	 */
	if (!alg->finup)
		alg->finup = shash_default_finup;
	if (!alg->digest)
		alg->digest = shash_default_digest;
	if (!alg->export)
		alg->halg.statesize = alg->descsize;
	if (!alg->setkey)
		alg->setkey = shash_no_setkey;

	return 0;
}

int crypto_register_shash(struct shash_alg *alg)
{
	struct crypto_alg *base = &alg->base;
	int err;

	err = shash_prepare_alg(alg);
	if (err)
		return err;

	return crypto_register_alg(base);
}
EXPORT_SYMBOL_GPL(crypto_register_shash);

void crypto_unregister_shash(struct shash_alg *alg)
{
	crypto_unregister_alg(&alg->base);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shash);

int crypto_register_shashes(struct shash_alg *algs, int count)
{
	int i, ret;

	for (i = 0; i < count; i++) {
		ret = crypto_register_shash(&algs[i]);
		if (ret)
			goto err;
	}

	return 0;

err:
	for (--i; i >= 0; --i)
		crypto_unregister_shash(&algs[i]);

	return ret;
}
EXPORT_SYMBOL_GPL(crypto_register_shashes);

void crypto_unregister_shashes(struct shash_alg *algs, int count)
{
	int i;

	for (i = count - 1; i >= 0; --i)
		crypto_unregister_shash(&algs[i]);
}
EXPORT_SYMBOL_GPL(crypto_unregister_shashes);

int shash_register_instance(struct crypto_template *tmpl,
			    struct shash_instance *inst)
{
	int err;

	if (WARN_ON(!inst->free))
		return -EINVAL;

	err = shash_prepare_alg(&inst->alg);
	if (err)
		return err;

	return crypto_register_instance(tmpl, shash_crypto_instance(inst));
}
EXPORT_SYMBOL_GPL(shash_register_instance);

void shash_free_singlespawn_instance(struct shash_instance *inst)
{
	crypto_drop_spawn(shash_instance_ctx(inst));
	kfree(inst);
}
EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous cryptographic hash type");
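/*
 * Illustrative usage sketch, not part of the file above: how a caller
 * might use the exported shash API to compute a one-shot digest.
 * "sha256" is only an example algorithm name, and example_sha256_digest()
 * is a hypothetical helper.
 */
#include <crypto/hash.h>
#include <crypto/sha2.h>

static int example_sha256_digest(const u8 *data, unsigned int len,
				 u8 out[SHA256_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("sha256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* One-shot wrapper seen above: sets up a descriptor, digests,
	 * and zeroizes the descriptor before returning.
	 */
	err = crypto_shash_tfm_digest(tfm, data, len, out);

	crypto_free_shash(tfm);
	return err;
}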
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/* Structure to mediate between updaters and fastpath-using readers. */
struct rcu_sync {
	int			gp_state;
	int			gp_count;
	wait_queue_head_t	gp_wait;

	struct rcu_head		cb_head;
};

/**
 * rcu_sync_is_idle() - Are readers permitted to use their fastpaths?
 * @rsp: Pointer to rcu_sync structure to use for synchronization
 *
 * Returns true if readers are permitted to use their fastpaths.  Must be
 * invoked within some flavor of RCU read-side critical section.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
			 "suspicious rcu_sync_is_idle() usage");
	return !READ_ONCE(rsp->gp_state); /* GP_IDLE */
}

extern void rcu_sync_init(struct rcu_sync *);
extern void rcu_sync_enter_start(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);

#define __RCU_SYNC_INITIALIZER(name) {					\
		.gp_state = 0,						\
		.gp_count = 0,						\
		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
	}

#define	DEFINE_RCU_SYNC(name)	\
	struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */
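/*
 * Illustrative usage sketch, not part of the header above: the
 * reader-side pattern this header is built for.  A reader calls
 * rcu_sync_is_idle() inside an RCU read-side critical section and takes
 * a lock-free fast path while no updater has called rcu_sync_enter().
 * example_rss and the rss_* helpers are hypothetical names.
 */
static DEFINE_RCU_SYNC(example_rss);

static void rss_fastpath(void) { }	/* hypothetical: lock-free reader work */
static void rss_slowpath(void) { }	/* hypothetical: locked reader work */

static void example_reader(void)
{
	rcu_read_lock();
	if (rcu_sync_is_idle(&example_rss))
		rss_fastpath();	/* no updater in the way */
	else
		rss_slowpath();	/* an updater is (or was recently) active */
	rcu_read_unlock();
}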
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Definitions for the SMC module (socket related)
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */
#ifndef __SMC_H
#define __SMC_H

#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
#include <net/genetlink.h>
#include <net/sock.h>

#include "smc_ib.h"

#define SMC_V1		1		/* SMC version V1 */
#define SMC_V2		2		/* SMC version V2 */

#define SMC_RELEASE_0	0
#define SMC_RELEASE_1	1
#define SMC_RELEASE	SMC_RELEASE_1	/* the latest release version */

#define SMCPROTO_SMC	0	/* SMC protocol, IPv4 */
#define SMCPROTO_SMC6	1	/* SMC protocol, IPv6 */

#define SMC_AUTOCORKING_DEFAULT_SIZE	0x10000	/* 64K by default */

extern struct proto smc_proto;
extern struct proto smc_proto6;

#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif

enum smc_state {		/* possible states of an SMC socket */
	SMC_ACTIVE	= 1,
	SMC_INIT	= 2,
	SMC_CLOSED	= 7,
	SMC_LISTEN	= 10,
	/* normal close */
	SMC_PEERCLOSEWAIT1	= 20,
	SMC_PEERCLOSEWAIT2	= 21,
	SMC_APPFINCLOSEWAIT	= 24,
	SMC_APPCLOSEWAIT1	= 22,
	SMC_APPCLOSEWAIT2	= 23,
	SMC_PEERFINCLOSEWAIT	= 25,
	/* abnormal close */
	SMC_PEERABORTWAIT	= 26,
	SMC_PROCESSABORT	= 27,
};

enum smc_supplemental_features {
	SMC_SPF_VIRT_ISM_DEV	= 0,
};

#define SMC_FEATURE_MASK \
	(BIT(SMC_SPF_VIRT_ISM_DEV))

struct smc_link_group;

struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
	union {
		u8 type;
#if defined(__BIG_ENDIAN_BITFIELD)
		struct {
			u8 llc_version:4,
			   llc_type:4;
		};
#elif defined(__LITTLE_ENDIAN_BITFIELD)
		struct {
			u8 llc_type:4,
			   llc_version:4;
		};
#endif
	};
} __aligned(1);

struct smc_cdc_conn_state_flags {
#if defined(__BIG_ENDIAN_BITFIELD)
	u8	peer_done_writing : 1;	/* Sending done indicator */
	u8	peer_conn_closed : 1;	/* Peer connection closed indicator */
	u8	peer_conn_abort : 1;	/* Abnormal close indicator */
	u8	reserved : 5;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
	u8	reserved : 5;
	u8	peer_conn_abort : 1;
	u8	peer_conn_closed : 1;
	u8	peer_done_writing : 1;
#endif
};

struct smc_cdc_producer_flags {
#if defined(__BIG_ENDIAN_BITFIELD)
	u8	write_blocked : 1;	/* Writing Blocked, no rx buf space */
	u8	urg_data_pending : 1;	/* Urgent Data Pending */
	u8	urg_data_present : 1;	/* Urgent Data Present */
	u8	cons_curs_upd_req : 1;	/* cursor update requested */
	u8	failover_validation : 1;/* message replay due to failover */
	u8	reserved : 3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
	u8	reserved : 3;
	u8	failover_validation : 1;
	u8	cons_curs_upd_req : 1;
	u8	urg_data_present : 1;
	u8	urg_data_pending : 1;
	u8	write_blocked : 1;
#endif
};

/* in host byte order */
union smc_host_cursor {	/* SMC cursor - an offset in an RMBE */
	struct {
		u16	reserved;
		u16	wrap;		/* window wrap sequence number */
		u32	count;		/* cursor (= offset) part */
	};
#ifdef KERNEL_HAS_ATOMIC64
	atomic64_t	acurs;		/* for atomic processing */
#else
	u64		acurs;		/* for atomic processing */
#endif
} __aligned(8);

/* in host byte order, except for flag bitfields in network byte order */
struct smc_host_cdc_msg {		/* Connection Data Control message */
	struct smc_wr_rx_hdr		common; /* .type = 0xFE */
	u8				len;	/* length = 44 */
	u16				seqno;	/* connection seq # */
	u32				token;	/* alert_token */
	union smc_host_cursor		prod;	/* producer cursor */
	union smc_host_cursor		cons;	/* consumer cursor,
						 * piggy backed "ack"
						 */
	struct smc_cdc_producer_flags	prod_flags;	/* conn. tx/rx status */
	struct smc_cdc_conn_state_flags	conn_state_flags; /* peer conn. status*/
	u8				reserved[18];
} __aligned(8);

enum smc_urg_state {
	SMC_URG_VALID	= 1,		/* data present */
	SMC_URG_NOTYET	= 2,		/* data pending */
	SMC_URG_READ	= 3,		/* data was already read */
};

struct smc_mark_woken {
	bool woken;
	void *key;
	wait_queue_entry_t wait_entry;
};

struct smc_connection {
	struct rb_node		alert_node;
	struct smc_link_group	*lgr;		/* link group of connection */
	struct smc_link		*lnk;		/* assigned SMC-R link */
	u32			alert_token_local; /* unique conn. id */
	u8			peer_rmbe_idx;	/* from tcp handshake */
	int			peer_rmbe_size;	/* size of peer rx buffer */
	atomic_t		peer_rmbe_space;/* remaining free bytes in peer
						 * rmbe
						 */
	int			rtoken_idx;	/* idx to peer RMB rkey/addr */

	struct smc_buf_desc	*sndbuf_desc;	/* send buffer descriptor */
	struct smc_buf_desc	*rmb_desc;	/* RMBE descriptor */
	int			rmbe_size_comp; /* compressed notation */
	int			rmbe_update_limit;
						/* lower limit for consumer
						 * cursor update
						 */

	struct smc_host_cdc_msg	local_tx_ctrl;	/* host byte order staging
						 * buffer for CDC msg send
						 * .prod cf. TCP snd_nxt
						 * .cons cf. TCP sends ack
						 */
	union smc_host_cursor	local_tx_ctrl_fin;
						/* prod crsr - confirmed by peer
						 */
	union smc_host_cursor	tx_curs_prep;	/* tx - prepared data
						 * snd_max..wmem_alloc
						 */
	union smc_host_cursor	tx_curs_sent;	/* tx - sent data
						 * snd_nxt ?
						 */
	union smc_host_cursor	tx_curs_fin;	/* tx - confirmed by peer
						 * snd-wnd-begin ?
						 */
	atomic_t		sndbuf_space;	/* remaining space in sndbuf */
	u16			tx_cdc_seq;	/* sequence # for CDC send */
	u16			tx_cdc_seq_fin;	/* sequence # - tx completed */
	spinlock_t		send_lock;	/* protect wr_sends */
	atomic_t		cdc_pend_tx_wr;	/* number of pending tx CDC wqe
						 * - inc when post wqe,
						 * - dec on polled tx cqe
						 */
	wait_queue_head_t	cdc_pend_tx_wq;	/* wakeup on no cdc_pend_tx_wr*/
	struct delayed_work	tx_work;	/* retry of smc_cdc_msg_send */
	u32			tx_off;		/* base offset in peer rmb */

	struct smc_host_cdc_msg	local_rx_ctrl;	/* filled during event_handl.
						 * .prod cf. TCP rcv_nxt
						 * .cons cf. TCP snd_una
						 */
	union smc_host_cursor	rx_curs_confirmed; /* confirmed to peer
						    * source of snd_una ?
						    */
	union smc_host_cursor	urg_curs;	/* points at urgent byte */
	enum smc_urg_state	urg_state;
	bool			urg_tx_pend;	/* urgent data staged */
	bool			urg_rx_skip_pend;
						/* indicate urgent oob data
						 * read, but previous regular
						 * data still pending
						 */
	char			urg_rx_byte;	/* urgent byte */
	bool			tx_in_release_sock;
						/* flush pending tx data in
						 * sock release_cb()
						 */
	atomic_t		bytes_to_rcv;	/* arrived data,
						 * not yet received
						 */
	atomic_t		splice_pending;	/* number of spliced bytes
						 * pending processing
						 */
#ifndef KERNEL_HAS_ATOMIC64
	spinlock_t		acurs_lock;	/* protect cursors */
#endif
	struct work_struct	close_work;	/* peer sent some closing */
	struct work_struct	abort_work;	/* abort the connection */
	struct tasklet_struct	rx_tsklet;	/* Receiver tasklet for SMC-D */
	u8			rx_off;		/* receive offset:
						 * 0 for SMC-R, 32 for SMC-D
						 */
	u64			peer_token;	/* SMC-D token of peer */
	u8			killed : 1;	/* abnormal termination */
	u8			freed : 1;	/* normal termination */
	u8			out_of_sync : 1; /* out of sync with peer */
};

struct smc_sock {				/* smc sock container */
	struct sock		sk;
	struct socket		*clcsock;	/* internal tcp socket */
	void			(*clcsk_state_change)(struct sock *sk);
						/* original state_change fct. */
	void			(*clcsk_data_ready)(struct sock *sk);
						/* original data_ready fct. */
	void			(*clcsk_write_space)(struct sock *sk);
						/* original write_space fct. */
	void			(*clcsk_error_report)(struct sock *sk);
						/* original error_report fct. */
	struct smc_connection	conn;		/* smc connection */
	struct smc_sock		*listen_smc;	/* listen parent */
	struct work_struct	connect_work;	/* handle non-blocking connect*/
	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
	struct work_struct	smc_listen_work;/* prepare new accept socket */
	struct list_head	accept_q;	/* sockets to be accepted */
	spinlock_t		accept_q_lock;	/* protects accept_q */
	bool			limit_smc_hs;	/* put constraint on handshake */
	bool			use_fallback;	/* fallback to tcp */
	int			fallback_rsn;	/* reason for fallback */
	u32			peer_diagnosis;	/* decline reason from peer */
	atomic_t		queued_smc_hs;	/* queued smc handshakes */
	struct inet_connection_sock_af_ops	af_ops;
	const struct inet_connection_sock_af_ops	*ori_af_ops;
						/* original af ops */
	int			sockopt_defer_accept;
						/* sockopt TCP_DEFER_ACCEPT
						 * value
						 */
	u8			wait_close_tx_prepared : 1;
						/* shutdown wr or close
						 * started, waiting for unsent
						 * data to be sent
						 */
	u8			connect_nonblock : 1;
						/* non-blocking connect in
						 * flight
						 */
	struct mutex		clcsock_release_lock;
						/* protects clcsock of a listen
						 * socket
						 */
};

#define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk)

static inline void smc_init_saved_callbacks(struct smc_sock *smc)
{
	smc->clcsk_state_change	= NULL;
	smc->clcsk_data_ready	= NULL;
	smc->clcsk_write_space	= NULL;
	smc->clcsk_error_report	= NULL;
}

static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
{
	return (struct smc_sock *)
	       ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
}

/* save target_cb in saved_cb, and replace target_cb with new_cb */
static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *),
					  void (*new_cb)(struct sock *),
					  void (**saved_cb)(struct sock *))
{
	/* only save once */
	if (!*saved_cb)
		*saved_cb = *target_cb;
	*target_cb = new_cb;
}

/* restore target_cb to saved_cb, and reset saved_cb to NULL */
static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *),
					  void (**saved_cb)(struct sock *))
{
	if (!*saved_cb)
		return;

	*target_cb = *saved_cb;
	*saved_cb = NULL;
}

extern struct workqueue_struct	*smc_hs_wq;	/* wq for handshake work */
extern struct workqueue_struct	*smc_close_wq;	/* wq for close work */

#define SMC_SYSTEMID_LEN		8

extern u8	local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */

#define ntohll(x) be64_to_cpu(x)
#define htonll(x) cpu_to_be64(x)

/* convert a u32 value into network byte order, store it into a 3 byte field */
static inline void hton24(u8 *net, u32 host)
{
	__be32 t;

	t = cpu_to_be32(host);
	memcpy(net, ((u8 *)&t) + 1, 3);
}

/* convert a received 3 byte field into host byte order */
static inline u32 ntoh24(u8 *net)
{
	__be32 t = 0;

	memcpy(((u8 *)&t) + 1, net, 3);
	return be32_to_cpu(t);
}

#ifdef CONFIG_XFRM
static inline bool using_ipsec(struct smc_sock *smc)
{
	return (smc->clcsock->sk->sk_policy[0] ||
		smc->clcsock->sk->sk_policy[1]) ? true : false;
}
#else
static inline bool using_ipsec(struct smc_sock *smc)
{
	return false;
}
#endif

struct smc_gidlist;

struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
void smc_close_non_accepted(struct sock *sk);
void smc_fill_gid_list(struct smc_link_group *lgr,
		       struct smc_gidlist *gidlist,
		       struct smc_ib_device *known_dev, u8 *known_gid);

/* smc handshake limitation interface for netlink */
int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);

static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag)
{
	set_bit(flag, &sk->sk_flags);
}

#endif	/* __SMC_H */
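/*
 * Illustrative round-trip sketch, not part of the header above, for the
 * 3-byte field helpers: hton24() stores only the low 24 bits of the host
 * value, so the round trip through ntoh24() is exact for values below
 * 2^24.  example_3byte_roundtrip() is a hypothetical name.
 */
static void example_3byte_roundtrip(void)
{
	u8 wire[3];
	u32 host = 0x00abcdef;

	hton24(wire, host);	/* wire[] = { 0xab, 0xcd, 0xef } */
	WARN_ON(ntoh24(wire) != host);
}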
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _RDS_RDS_H
#define _RDS_RDS_H

#include <net/sock.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <rdma/rdma_cm.h>
#include <linux/mutex.h>
#include <linux/rds.h>
#include <linux/rhashtable.h>
#include <linux/refcount.h>
#include <linux/in6.h>

#include "info.h"

/*
 * RDS Network protocol version
 */
#define RDS_PROTOCOL_3_0	0x0300
#define RDS_PROTOCOL_3_1	0x0301
#define RDS_PROTOCOL_4_0	0x0400
#define RDS_PROTOCOL_4_1	0x0401
#define RDS_PROTOCOL_VERSION	RDS_PROTOCOL_3_1
#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | min)
#define RDS_PROTOCOL_COMPAT_VERSION	RDS_PROTOCOL_3_1

/* The following ports, 16385, 18634, 18635, are registered with IANA as
 * the ports to be used for RDS over TCP and UDP.  Currently, only RDS over
 * TCP and RDS over IB/RDMA are implemented.  18634 is the historical value
 * used for the RDMA_CM listener port.  RDS/TCP uses port 16385.  After
 * IPv6 work, RDMA_CM also uses 16385 as the listener port.  18634 is kept
 * to ensure compatibility with older RDS modules.  Those ports are defined
 * in each transport's header file.
 */
#define RDS_PORT	18634

#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
#ifdef RDS_DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
/* sigh, pr_debug() causes unused variable warnings */
static inline __printf(1, 2)
void rdsdebug(char *fmt, ...)
{
}
#endif

#define RDS_FRAG_SHIFT	12
#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))

/* Used to limit both RDMA and non-RDMA RDS message to 1MB */
#define RDS_MAX_MSG_SIZE	((unsigned int)(1 << 20))

#define RDS_CONG_MAP_BYTES	(65536 / 8)
#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)

struct rds_cong_map {
	struct rb_node		m_rb_node;
	struct in6_addr		m_addr;
	wait_queue_head_t	m_waitq;
	struct list_head	m_conn_list;
	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
};

/*
 * This is how we will track the connection state:
 * A connection is always in one of the following
 * states. Updates to the state are atomic and imply
 * a memory barrier.
 */
enum {
	RDS_CONN_DOWN = 0,
	RDS_CONN_CONNECTING,
	RDS_CONN_DISCONNECTING,
	RDS_CONN_UP,
	RDS_CONN_RESETTING,
	RDS_CONN_ERROR,
};

/* Bits for c_flags */
#define RDS_LL_SEND_FULL	0
#define RDS_RECONNECT_PENDING	1
#define RDS_IN_XMIT		2
#define RDS_RECV_REFILL		3
#define	RDS_DESTROY_PENDING	4

/* Max number of multipaths per RDS connection. Must be a power of 2 */
#define	RDS_MPATH_WORKERS	8
#define	RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
			       (rs)->rs_hash_initval) & ((n) - 1))

#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))

/* Per mpath connection state */
struct rds_conn_path {
	struct rds_connection	*cp_conn;
	struct rds_message	*cp_xmit_rm;
	unsigned long		cp_xmit_sg;
	unsigned int		cp_xmit_hdr_off;
	unsigned int		cp_xmit_data_off;
	unsigned int		cp_xmit_atomic_sent;
	unsigned int		cp_xmit_rdma_sent;
	unsigned int		cp_xmit_data_sent;

	spinlock_t		cp_lock;	/* protect msg queues */
	u64			cp_next_tx_seq;
	struct list_head	cp_send_queue;
	struct list_head	cp_retrans;

	u64			cp_next_rx_seq;

	void			*cp_transport_data;

	atomic_t		cp_state;
	unsigned long		cp_send_gen;
	unsigned long		cp_flags;
	unsigned long		cp_reconnect_jiffies;
	struct delayed_work	cp_send_w;
	struct delayed_work	cp_recv_w;
	struct delayed_work	cp_conn_w;
	struct work_struct	cp_down_w;
	struct mutex		cp_cm_lock;	/* protect cp_state & cm */
	wait_queue_head_t	cp_waitq;

	unsigned int		cp_unacked_packets;
	unsigned int		cp_unacked_bytes;
	unsigned int		cp_index;
};

/* One rds_connection per RDS address pair */
struct rds_connection {
	struct hlist_node	c_hash_node;
	struct in6_addr		c_laddr;
	struct in6_addr		c_faddr;
	int			c_dev_if;	/* ifindex used for this conn */
	int			c_bound_if;	/* ifindex of c_laddr */
	unsigned int		c_loopback:1,
				c_isv6:1,
				c_ping_triggered:1,
				c_pad_to_32:29;
	int			c_npaths;
	struct rds_connection	*c_passive;
	struct rds_transport	*c_trans;

	struct rds_cong_map	*c_lcong;
	struct rds_cong_map	*c_fcong;

	/* Protocol version */
	unsigned int		c_proposed_version;
	unsigned int		c_version;
	possible_net_t		c_net;

	/* TOS */
	u8			c_tos;

	struct list_head	c_map_item;
	unsigned long		c_map_queued;

	struct rds_conn_path	*c_path;
	wait_queue_head_t	c_hs_waitq;	/* handshake waitq */

	u32			c_my_gen_num;
	u32			c_peer_gen_num;
};

static inline
struct net *rds_conn_net(struct rds_connection *conn)
{
	return read_pnet(&conn->c_net);
}

static inline
void rds_conn_net_set(struct rds_connection *conn, struct net *net)
{
	write_pnet(&conn->c_net, net);
}

#define RDS_FLAG_CONG_BITMAP	0x01
#define RDS_FLAG_ACK_REQUIRED	0x02
#define RDS_FLAG_RETRANSMITTED	0x04
#define RDS_MAX_ADV_CREDIT	255

/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
 * probe to exchange control information before establishing a connection.
 * Currently the control information that is exchanged is the number of
 * supported paths.  If the peer is a legacy (older kernel revision) peer,
 * it would return a pong message without additional control information
 * that would then alert the sender that the peer was an older rev.
 */
#define RDS_FLAG_PROBE_PORT	1
#define	RDS_HS_PROBE(sport, dport) \
		((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \
		 (sport == 0 && dport == RDS_FLAG_PROBE_PORT))

/*
 * Maximum space available for extension headers.
 */
#define RDS_HEADER_EXT_SPACE	16

struct rds_header {
	__be64	h_sequence;
	__be64	h_ack;
	__be32	h_len;
	__be16	h_sport;
	__be16	h_dport;
	u8	h_flags;
	u8	h_credit;
	u8	h_padding[4];
	__sum16	h_csum;

	u8	h_exthdr[RDS_HEADER_EXT_SPACE];
};

/*
 * Reserved - indicates end of extensions
 */
#define RDS_EXTHDR_NONE		0

/*
 * This extension header is included in the very
 * first message that is sent on a new connection,
 * and identifies the protocol level.  This will help
 * rolling updates if a future change requires breaking
 * the protocol.
 * NB: This is no longer true for IB, where we do a version
 * negotiation during the connection setup phase (protocol
 * version information is included in the RDMA CM private data).
 */
#define RDS_EXTHDR_VERSION	1
struct rds_ext_header_version {
	__be32			h_version;
};

/*
 * This extension header is included in the RDS message
 * chasing an RDMA operation.
 */
#define RDS_EXTHDR_RDMA		2
struct rds_ext_header_rdma {
	__be32			h_rdma_rkey;
};

/*
 * This extension header tells the peer about the
 * destination <R_Key,offset> of the requested RDMA
 * operation.
 */
#define RDS_EXTHDR_RDMA_DEST	3
struct rds_ext_header_rdma_dest {
	__be32			h_rdma_rkey;
	__be32			h_rdma_offset;
};

/* Extension header announcing number of paths.
 * Implicit length = 2 bytes.
 */
#define RDS_EXTHDR_NPATHS	5
#define RDS_EXTHDR_GEN_NUM	6

#define __RDS_EXTHDR_MAX	16 /* for now */
#define RDS_RX_MAX_TRACES	(RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
#define	RDS_MSG_RX_HDR		0
#define	RDS_MSG_RX_START	1
#define	RDS_MSG_RX_END		2
#define	RDS_MSG_RX_CMSG		3

/* The following values are whitelisted for usercopy */
struct rds_inc_usercopy {
	rds_rdma_cookie_t	rdma_cookie;
	ktime_t			rx_tstamp;
};

struct rds_incoming {
	refcount_t		i_refcount;
	struct list_head	i_item;
	struct rds_connection	*i_conn;
	struct rds_conn_path	*i_conn_path;
	struct rds_header	i_hdr;
	unsigned long		i_rx_jiffies;
	struct in6_addr		i_saddr;
	struct rds_inc_usercopy	i_usercopy;
	u64			i_rx_lat_trace[RDS_RX_MAX_TRACES];
};

struct rds_mr {
	struct rb_node		r_rb_node;
	struct kref		r_kref;
	u32			r_key;

	/* A copy of the creation flags */
	unsigned int		r_use_once:1;
	unsigned int		r_invalidate:1;
	unsigned int		r_write:1;

	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
	struct rds_transport	*r_trans;
	void			*r_trans_private;
};

static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
	return r_key | (((u64) offset) << 32);
}

static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
	return cookie;
}

static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
	return cookie >> 32;
}

/* atomic operation types */
#define RDS_ATOMIC_TYPE_CSWP		0
#define RDS_ATOMIC_TYPE_FADD		1

/*
 * m_sock_item and m_conn_item are on lists that are serialized under
 * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
 * the message will not be put back on the retransmit list after being sent.
 * messages that are canceled while being sent rely on this.
 *
 * m_inc is used by loopback so that it can pass an incoming message straight
 * back up into the rx path.  It embeds a wire header which is also used by
 * the send path, which is kind of awkward.
 *
 * m_sock_item indicates the message's presence on a socket's send or receive
 * queue.  m_rs will point to that socket.
 *
 * m_daddr is used by cancellation to prune messages to a given destination.
 *
 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
 * nesting.  As paths iterate over messages on a sock, or conn, they must
 * also lock the conn, or sock, to remove the message from those lists too.
 * Testing the flag to determine if the message is still on the lists lets
 * us avoid testing the list_head directly.  That means each path can use
 * the message's list_head to keep it on a local list while juggling locks
 * without confusing the other path.
 *
 * m_ack_seq is an optional field set by transports who need a different
 * sequence number range to invalidate.  They can use this in a callback
 * that they pass to rds_send_drop_acked() to see if each message has been
 * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
 * had ack_seq set yet.
 */
#define RDS_MSG_ON_SOCK		1
#define RDS_MSG_ON_CONN		2
#define RDS_MSG_HAS_ACK_SEQ	3
#define RDS_MSG_ACK_REQUIRED	4
#define RDS_MSG_RETRANSMITTED	5
#define RDS_MSG_MAPPED		6
#define RDS_MSG_PAGEVEC		7
#define RDS_MSG_FLUSH		8

struct rds_znotifier {
	struct mmpin		z_mmp;
	u32			z_cookie;
};

struct rds_msg_zcopy_info {
	struct list_head rs_zcookie_next;
	union {
		struct rds_znotifier znotif;
		struct rds_zcopy_cookies zcookies;
	};
};

struct rds_msg_zcopy_queue {
	struct list_head zcookie_head;
	spinlock_t lock; /* protects zcookie_head queue */
};

static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q)
{
	spin_lock_init(&q->lock);
	INIT_LIST_HEAD(&q->zcookie_head);
}

struct rds_iov_vector {
	struct rds_iovec *iov;
	int               len;
};

struct rds_iov_vector_arr {
	struct rds_iov_vector *vec;
	int                    len;
	int                    indx;
	int                    incr;
};

struct rds_message {
	refcount_t		m_refcount;
	struct list_head	m_sock_item;
	struct list_head	m_conn_item;
	struct rds_incoming	m_inc;
	u64			m_ack_seq;
	struct in6_addr		m_daddr;
	unsigned long		m_flags;

	/* Never access m_rs without holding m_rs_lock.
	 * Lock nesting is
	 *  rm->m_rs_lock
	 *   -> rs->rs_lock
	 */
	spinlock_t		m_rs_lock;
	wait_queue_head_t	m_flush_wait;

	struct rds_sock		*m_rs;

	/* cookie to send to remote, in rds header */
	rds_rdma_cookie_t	m_rdma_cookie;

	unsigned int		m_used_sgs;
	unsigned int		m_total_sgs;

	void			*m_final_op;

	struct {
		struct rm_atomic_op {
			int			op_type;
			union {
				struct {
					uint64_t	compare;
					uint64_t	swap;
					uint64_t	compare_mask;
					uint64_t	swap_mask;
				} op_m_cswp;
				struct {
					uint64_t	add;
					uint64_t	nocarry_mask;
				} op_m_fadd;
			};

			u32			op_rkey;
			u64			op_remote_addr;
			unsigned int		op_notify:1;
			unsigned int		op_recverr:1;
			unsigned int		op_mapped:1;
			unsigned int		op_silent:1;
			unsigned int		op_active:1;
			struct scatterlist	*op_sg;
			struct rds_notifier	*op_notifier;

			struct rds_mr		*op_rdma_mr;
		} atomic;
		struct rm_rdma_op {
			u32			op_rkey;
			u64			op_remote_addr;
			unsigned int		op_write:1;
			unsigned int		op_fence:1;
			unsigned int		op_notify:1;
			unsigned int		op_recverr:1;
			unsigned int		op_mapped:1;
			unsigned int		op_silent:1;
			unsigned int		op_active:1;
			unsigned int		op_bytes;
			unsigned int		op_nents;
			unsigned int		op_count;
			struct scatterlist	*op_sg;
			struct rds_notifier	*op_notifier;

			struct rds_mr		*op_rdma_mr;

			u64			op_odp_addr;
			struct rds_mr		*op_odp_mr;
		} rdma;
		struct rm_data_op {
			unsigned int		op_active:1;
			unsigned int		op_nents;
			unsigned int		op_count;
			unsigned int		op_dmasg;
			unsigned int		op_dmaoff;
			struct rds_znotifier	*op_mmp_znotifier;
			struct scatterlist	*op_sg;
		} data;
	};

	struct rds_conn_path *m_conn_path;
};

/*
 * The RDS notifier is used (optionally) to tell the application about
 * completed RDMA operations.  Rather than keeping the whole rds message
 * around on the queue, we allocate a small notifier that is put on the
 * socket's notifier_list.  Notifications are delivered to the application
 * through control messages.
 */
struct rds_notifier {
	struct list_head	n_list;
	uint64_t		n_user_token;
	int			n_status;
};

/* Available as part of RDS core, so doesn't need to participate
 * in get_preferred transport etc
 */
#define	RDS_TRANS_LOOP	3

/**
 * struct rds_transport - transport specific behavioural hooks
 *
 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
 *        part of a message.  The caller serializes on the send_sem so this
 *        doesn't need to be reentrant for a given conn.  The header must be
 *        sent before the data payload.  .xmit must be prepared to send a
 *        message with no data payload.  .xmit should return the number of
 *        bytes that were sent down the connection, including header bytes.
 *        Returning 0 tells the caller that it doesn't need to perform any
 *        additional work now.  This is usually the case when the transport has
 *        filled the sending queue for its connection and will handle
 *        triggering the rds thread to continue the send when space becomes
 *        available.  Returning -EAGAIN tells the caller to retry the send
 *        immediately.  Returning -ENOMEM tells the caller to retry the send at
 *        some point in the future.
 *
 * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
 *
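/*
 * Illustrative sketch, not part of the header above (which is cut off
 * mid-comment in the source): the rdma cookie helpers defined earlier
 * pack the R_Key into the low 32 bits and the offset into the high 32
 * bits, so a make/extract round trip is lossless.
 * example_cookie_roundtrip() is a hypothetical name.
 */
static void example_cookie_roundtrip(void)
{
	rds_rdma_cookie_t cookie = rds_rdma_make_cookie(0x1234, 4096);

	WARN_ON(rds_rdma_cookie_key(cookie) != 0x1234);
	WARN_ON(rds_rdma_cookie_offset(cookie) != 4096);
}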